diff --git a/clang/test/CodeGen/AArch64/sme-remarks.c b/clang/test/CodeGen/AArch64/sme-remarks.c index fd144b8a6c425..f7a1f33f3372d 100644 --- a/clang/test/CodeGen/AArch64/sme-remarks.c +++ b/clang/test/CodeGen/AArch64/sme-remarks.c @@ -1,39 +1,39 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -Rpass-analysis=sme -verify %s -S -o /dev/null -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -mllvm -aarch64-new-sme-abi -Rpass-analysis=sme -verify=expected-new %s -S -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -mllvm -aarch64-new-sme-abi=false -Rpass-analysis=sme -verify=expected-sdag %s -S -o /dev/null +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -Rpass-analysis=sme -verify %s -S -o /dev/null %s void private_za_callee_a(); void private_za_callee_b(); void private_za_callee_c(); void test_za_merge_paths(int a) __arm_inout("za") { - // expected-new-remark@+1 {{lazy save of ZA emitted in 'test_za_merge_paths'}} + // expected-remark@+1 {{lazy save of ZA emitted in 'test_za_merge_paths'}} if (a != 0) - // expected-remark@+2 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}} - // expected-new-remark@+1 {{call to 'private_za_callee_a' requires ZA save}} + // expected-sdag-remark@+2 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}} + // expected-remark@+1 {{call to 'private_za_callee_a' requires ZA save}} private_za_callee_a(); else - // expected-remark@+2 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}} - // expected-new-remark@+1 {{call to 'private_za_callee_b' requires ZA save}} + // expected-sdag-remark@+2 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}} + // expected-remark@+1 {{call to 'private_za_callee_b' requires ZA save}} private_za_callee_b(); - // expected-remark@+3 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}} + // expected-sdag-remark@+3 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}} /// The new lowering won't report this call as the save is already needed due /// to the call to `private_za_callee_a/b()` calls on both paths to this call. private_za_callee_c(); } void test_lazy_save_multiple_paths(int a) __arm_inout("za") { - // expected-new-remark@+1 {{lazy save of ZA emitted in 'test_lazy_save_multiple_paths'}} + // expected-remark@+1 {{lazy save of ZA emitted in 'test_lazy_save_multiple_paths'}} if (a != 0) - // expected-remark@+2 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}} - // expected-new-remark@+1 {{call to 'private_za_callee_a' requires ZA save}} + // expected-sdag-remark@+2 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}} + // expected-remark@+1 {{call to 'private_za_callee_a' requires ZA save}} private_za_callee_a(); else { - // expected-remark@+2 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}} - // expected-new-remark@+1 {{call to 'private_za_callee_b' requires ZA save}} + // expected-sdag-remark@+2 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}} + // expected-remark@+1 {{call to 'private_za_callee_b' requires ZA save}} private_za_callee_b(); - // expected-remark@+3 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}} + // expected-sdag-remark@+3 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}} /// The new lowering won't report this call as the save is already needed /// due to the call to `private_za_callee_b()`. private_za_callee_c(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 346e18e553c5e..1ec5a20cc0ce0 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -225,7 +225,7 @@ static cl::opt static cl::opt EnableNewSMEABILowering("aarch64-new-sme-abi", cl::desc("Enable new lowering for the SME ABI"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll index 96f5e5a4afb3e..80ff4fbb11a8f 100644 --- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -33,7 +33,6 @@ ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: AArch64 Stack Tagging -; CHECK-NEXT: SME ABI Pass ; CHECK-NEXT: Exception handling preparation ; CHECK-NEXT: Prepare callbr ; CHECK-NEXT: Safe Stack instrumentation pass @@ -56,6 +55,10 @@ ; CHECK-NEXT: AArch64 Instruction Selection ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: Local Stack Slot Allocation +; CHECK-NEXT: Bundle Machine CFG Edges +; CHECK-NEXT: Lazy Machine Block Frequency Analysis +; CHECK-NEXT: Machine Optimization Remark Emitter +; CHECK-NEXT: Machine SME ABI pass ; CHECK-NEXT: Eliminate PHI nodes for register allocation ; CHECK-NEXT: Two-Address instruction pass ; CHECK-NEXT: Fast Register Allocator diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index e8ea55e027aec..15266b0d6a916 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -97,8 +97,6 @@ ; CHECK-NEXT: Interleaved Load Combine Pass ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Interleaved Access Pass -; CHECK-NEXT: SME ABI Pass -; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Type Promotion ; CHECK-NEXT: CodeGen Prepare @@ -129,8 +127,11 @@ ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: AArch64 Local Dynamic TLS Access Clean-up ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions -; CHECK-NEXT: SME Peephole Optimization pass +; CHECK-NEXT: Bundle Machine CFG Edges ; CHECK-NEXT: Lazy Machine Block Frequency Analysis +; CHECK-NEXT: Machine Optimization Remark Emitter +; CHECK-NEXT: Machine SME ABI pass +; CHECK-NEXT: SME Peephole Optimization pass ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs ; CHECK-NEXT: Slot index numbering diff --git a/llvm/test/CodeGen/AArch64/sme-abi-save-call-remarks.ll b/llvm/test/CodeGen/AArch64/sme-abi-save-call-remarks.ll index 755dcfbf17ba4..c3c76e3e803d0 100644 --- a/llvm/test/CodeGen/AArch64/sme-abi-save-call-remarks.ll +++ b/llvm/test/CodeGen/AArch64/sme-abi-save-call-remarks.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64 -mattr=+sme2 --aarch64-new-sme-abi=false --pass-remarks-analysis=sme -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=CHECK-SDAG ; RUN: llc -mtriple=aarch64 -mattr=+sme2 --pass-remarks-analysis=sme -o /dev/null < %s 2>&1 | FileCheck %s -; RUN: llc -mtriple=aarch64 -mattr=+sme2 --aarch64-new-sme-abi --pass-remarks-analysis=sme -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=CHECK-NEWLOWERING declare void @private_za_callee() declare void @private_za_callee_a() @@ -13,42 +13,42 @@ declare void @shared_za_zt0_callee() "aarch64_inout_za" "aarch64_inout_zt0" ; Note: These remarks are more useful with source debug info (which gives line numbers for `:0:0`). define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { -; CHECK: remark: :0:0: call from 'test_lazy_save_1_callee' to 'private_za_callee' sets up a lazy save for ZA +; CHECK-SDAG: remark: :0:0: call from 'test_lazy_save_1_callee' to 'private_za_callee' sets up a lazy save for ZA -; CHECK-NEWLOWERING: remark: :0:0: lazy save of ZA emitted in 'test_lazy_save_1_callee' -; CHECK-NEWLOWERING-NEXT: remark: :0:0: call to 'private_za_callee' requires ZA save +; CHECK: remark: :0:0: lazy save of ZA emitted in 'test_lazy_save_1_callee' +; CHECK-NEXT: remark: :0:0: call to 'private_za_callee' requires ZA save call void @private_za_callee() ret void } define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { -; CHECK: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA -; CHECK: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA +; CHECK-SDAG: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA +; CHECK-SDAG: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA -; CHECK-NEWLOWERING: remark: :0:0: lazy save of ZA emitted in 'test_lazy_save_2_callees' -; CHECK-NEWLOWERING-NEXT: remark: :0:0: call to 'private_za_callee' requires ZA save +; CHECK: remark: :0:0: lazy save of ZA emitted in 'test_lazy_save_2_callees' +; CHECK-NEXT: remark: :0:0: call to 'private_za_callee' requires ZA save call void @private_za_callee() call void @private_za_callee() ret void } define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inout_za" { -; CHECK: remark: :0:0: call from 'test_lazy_save_expanded_intrinsic' to 'cosf' sets up a lazy save for ZA +; CHECK-SDAG: remark: :0:0: call from 'test_lazy_save_expanded_intrinsic' to 'cosf' sets up a lazy save for ZA -; CHECK-NEWLOWERING: remark: :0:0: lazy save of ZA emitted in 'test_lazy_save_expanded_intrinsic' -; CHECK-NEWLOWERING-NEXT: remark: :0:0: call to 'cosf' requires ZA save +; CHECK: remark: :0:0: lazy save of ZA emitted in 'test_lazy_save_expanded_intrinsic' +; CHECK-NEXT: remark: :0:0: call to 'cosf' requires ZA save %res = call float @llvm.cos.f32(float %a) ret float %res } define void @test_lazy_save_multiple_paths(i1 %a) "aarch64_inout_za" { -; CHECK: remark: :0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_a' sets up a lazy save for ZA -; CHECK: remark: :0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_b' sets up a lazy save for ZA -; CHECK: remark: :0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_c' sets up a lazy save for ZA +; CHECK-SDAG: remark: :0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_a' sets up a lazy save for ZA +; CHECK-SDAG: remark: :0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_b' sets up a lazy save for ZA +; CHECK-SDAG: remark: :0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_c' sets up a lazy save for ZA -; CHECK-NEWLOWERING: remark: :0:0: lazy save of ZA emitted in 'test_lazy_save_multiple_paths' -; CHECK-NEWLOWERING-NEXT: remark: :0:0: call to 'private_za_callee_b' requires ZA save -; CHECK-NEWLOWERING-NEXT: remark: :0:0: call to 'private_za_callee_a' requires ZA save +; CHECK: remark: :0:0: lazy save of ZA emitted in 'test_lazy_save_multiple_paths' +; CHECK-NEXT: remark: :0:0: call to 'private_za_callee_b' requires ZA save +; CHECK-NEXT: remark: :0:0: call to 'private_za_callee_a' requires ZA save entry: br i1 %a, label %if.end, label %if.else @@ -67,12 +67,12 @@ if.end: define void @test_lazy_save_with_zt0() "aarch64_inout_za" "aarch64_inout_zt0" { -; CHECK: remark: :0:0: call from 'test_lazy_save_with_zt0' to 'private_za_callee' sets up a lazy save for ZA +; CHECK-SDAG: remark: :0:0: call from 'test_lazy_save_with_zt0' to 'private_za_callee' sets up a lazy save for ZA -; CHECK-NEWLOWERING: remark: :0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0' -; CHECK-NEWLOWERING-NEXT: remark: :0:0: call to 'shared_za_callee' requires ZT0 save -; CHECK-NEWLOWERING-NEXT: remark: :0:0: lazy save of ZA emitted in 'test_lazy_save_with_zt0' -; CHECK-NEWLOWERING-NEXT: remark: :0:0: call to 'private_za_callee' requires ZA save +; CHECK: remark: :0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0' +; CHECK-NEXT: remark: :0:0: call to 'shared_za_callee' requires ZT0 save +; CHECK-NEXT: remark: :0:0: lazy save of ZA emitted in 'test_lazy_save_with_zt0' +; CHECK-NEXT: remark: :0:0: call to 'private_za_callee' requires ZA save call void @shared_za_callee() ; Save ZT0 (remark ZT0 spill) call void @private_za_callee() ; Save ZA (remark ZA save) ret void @@ -80,13 +80,13 @@ define void @test_lazy_save_with_zt0() "aarch64_inout_za" "aarch64_inout_zt0" define void @test_lazy_save_with_zt0_reload() "aarch64_inout_za" "aarch64_inout_zt0" { -; CHECK: remark: :0:0: call from 'test_lazy_save_with_zt0_reload' to 'private_za_callee' sets up a lazy save for ZA +; CHECK-SDAG: remark: :0:0: call from 'test_lazy_save_with_zt0_reload' to 'private_za_callee' sets up a lazy save for ZA -; CHECK-NEWLOWERING: remark: :0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0_reload' -; CHECK-NEWLOWERING-NEXT: remark: :0:0: call to 'shared_za_callee' requires ZT0 save -; CHECK-NEWLOWERING-NEXT: remark: :0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0_reload' -; CHECK-NEWLOWERING-NEXT: remark: :0:0: lazy save of ZA emitted in 'test_lazy_save_with_zt0_reload' -; CHECK-NEWLOWERING-NEXT: remark: :0:0: call to 'private_za_callee' requires ZA save +; CHECK: remark: :0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0_reload' +; CHECK-NEXT: remark: :0:0: call to 'shared_za_callee' requires ZT0 save +; CHECK-NEXT: remark: :0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0_reload' +; CHECK-NEXT: remark: :0:0: lazy save of ZA emitted in 'test_lazy_save_with_zt0_reload' +; CHECK-NEXT: remark: :0:0: call to 'private_za_callee' requires ZA save call void @shared_za_callee() ; Save ZT0 (remark ZT0 spill) call void @shared_za_zt0_callee() ; Reload ZT0 call void @private_za_callee() ; Save ZA, ZT0 (remark ZT0 spill and ZA save) @@ -96,9 +96,9 @@ define void @test_lazy_save_with_zt0_reload() "aarch64_inout_za" "aarch64_inout_ define void @test_za_merge_paths(i1 %a) "aarch64_za_state_agnostic" { ;; Note: The old lowering does not emit any remarks for agnostic ZA saves. -; CHECK-NEWLOWERING: remark: :0:0: full save of ZA emitted in 'test_za_merge_paths' -; CHECK-NEWLOWERING-NEXT: remark: :0:0: call to 'private_za_callee_b' requires ZA save -; CHECK-NEWLOWERING-NEXT: remark: :0:0: call to 'private_za_callee_a' requires ZA save +; CHECK: remark: :0:0: full save of ZA emitted in 'test_za_merge_paths' +; CHECK-NEXT: remark: :0:0: call to 'private_za_callee_b' requires ZA save +; CHECK-NEXT: remark: :0:0: call to 'private_za_callee_a' requires ZA save entry: br i1 %a, label %if.end, label %if.else @@ -119,10 +119,10 @@ exit: } define void @test_lazy_save_function_ptr_callee(ptr %private_za_callee) nounwind "aarch64_inout_za" { -; CHECK: remark: :0:0: call from 'test_lazy_save_function_ptr_callee' to 'unknown callee' sets up a lazy save for ZA +; CHECK-SDAG: remark: :0:0: call from 'test_lazy_save_function_ptr_callee' to 'unknown callee' sets up a lazy save for ZA -; CHECK-NEWLOWERING: remark: :0:0: lazy save of ZA emitted in 'test_lazy_save_function_ptr_callee' -; CHECK-NEWLOWERING-NEXT: remark: :0:0: call requires ZA save +; CHECK: remark: :0:0: lazy save of ZA emitted in 'test_lazy_save_function_ptr_callee' +; CHECK-NEXT: remark: :0:0: call requires ZA save call void %private_za_callee() ret void } diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll index 0906e10b551b7..344f1ef24b843 100644 --- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll +++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi=false | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG ; RUN: llc -mattr=+sme2 < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK -; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING target triple = "aarch64" @@ -24,6 +24,35 @@ define i64 @agnostic_caller_no_callees(ptr %ptr) nounwind "aarch64_za_state_agno ; inserted for calls to non-agnostic functions and that the arg/result registers are ; preserved by the register allocator. define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" { +; CHECK-SDAG-LABEL: agnostic_caller_private_za_callee: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: mov x8, x0 +; CHECK-SDAG-NEXT: bl __arm_sme_state_size +; CHECK-SDAG-NEXT: sub sp, sp, x0 +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: mov x0, x8 +; CHECK-SDAG-NEXT: bl private_za_decl +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: bl private_za_decl +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: agnostic_caller_private_za_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill @@ -37,12 +66,6 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state ; CHECK-NEXT: bl __arm_sme_save ; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: bl private_za_decl -; CHECK-NEXT: mov x1, x0 -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: bl __arm_sme_restore -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: bl __arm_sme_save -; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: bl private_za_decl ; CHECK-NEXT: mov x1, x0 ; CHECK-NEXT: mov x0, x19 @@ -52,29 +75,6 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: agnostic_caller_private_za_callee: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: mov x8, x0 -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size -; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0 -; CHECK-NEWLOWERING-NEXT: mov x19, sp -; CHECK-NEWLOWERING-NEXT: mov x0, x19 -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save -; CHECK-NEWLOWERING-NEXT: mov x0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_decl -; CHECK-NEWLOWERING-NEXT: bl private_za_decl -; CHECK-NEWLOWERING-NEXT: mov x1, x0 -; CHECK-NEWLOWERING-NEXT: mov x0, x19 -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore -; CHECK-NEWLOWERING-NEXT: mov x0, x1 -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret %res = call i64 @private_za_decl(i64 %v) %res2 = call i64 @private_za_decl(i64 %res) ret i64 %res2 @@ -110,6 +110,47 @@ define i64 @shared_caller_agnostic_callee(i64 %v) nounwind "aarch64_inout_za" "a ; agnostic-ZA + streaming -> private-ZA + non-streaming define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" "aarch64_pstate_sm_enabled" { +; CHECK-SDAG-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x8, x0 +; CHECK-SDAG-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: add x29, sp, #64 +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: bl __arm_sme_state_size +; CHECK-SDAG-NEXT: sub sp, sp, x0 +; CHECK-SDAG-NEXT: mov x20, sp +; CHECK-SDAG-NEXT: mov x0, x20 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: smstop sm +; CHECK-SDAG-NEXT: mov x0, x8 +; CHECK-SDAG-NEXT: bl private_za_decl +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: smstart sm +; CHECK-SDAG-NEXT: mov x0, x20 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x20 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: smstop sm +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: bl private_za_decl +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: smstart sm +; CHECK-SDAG-NEXT: mov x0, x20 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: sub sp, x29, #64 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill @@ -128,14 +169,6 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou ; CHECK-NEXT: smstop sm ; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: bl private_za_decl -; CHECK-NEXT: mov x1, x0 -; CHECK-NEXT: smstart sm -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: bl __arm_sme_restore -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: bl __arm_sme_save -; CHECK-NEXT: smstop sm -; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: bl private_za_decl ; CHECK-NEXT: mov x1, x0 ; CHECK-NEXT: smstart sm @@ -150,39 +183,6 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x8, x0 -; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: add x29, sp, #64 -; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size -; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0 -; CHECK-NEWLOWERING-NEXT: mov x20, sp -; CHECK-NEWLOWERING-NEXT: mov x0, x20 -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save -; CHECK-NEWLOWERING-NEXT: smstop sm -; CHECK-NEWLOWERING-NEXT: mov x0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_decl -; CHECK-NEWLOWERING-NEXT: bl private_za_decl -; CHECK-NEWLOWERING-NEXT: mov x1, x0 -; CHECK-NEWLOWERING-NEXT: smstart sm -; CHECK-NEWLOWERING-NEXT: mov x0, x20 -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore -; CHECK-NEWLOWERING-NEXT: mov x0, x1 -; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64 -; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret %res = call i64 @private_za_decl(i64 %v) %res2 = call i64 @private_za_decl(i64 %res) ret i64 %res2 @@ -190,6 +190,60 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou ; agnostic-ZA + streaming-compatible -> private-ZA + non-streaming define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" "aarch64_pstate_sm_compatible" { +; CHECK-SDAG-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x8, x0 +; CHECK-SDAG-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: add x29, sp, #64 +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mrs x20, SVCR +; CHECK-SDAG-NEXT: bl __arm_sme_state_size +; CHECK-SDAG-NEXT: sub sp, sp, x0 +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: tbz w20, #0, .LBB5_2 +; CHECK-SDAG-NEXT: // %bb.1: +; CHECK-SDAG-NEXT: smstop sm +; CHECK-SDAG-NEXT: .LBB5_2: +; CHECK-SDAG-NEXT: mov x0, x8 +; CHECK-SDAG-NEXT: bl private_za_decl +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: tbz w20, #0, .LBB5_4 +; CHECK-SDAG-NEXT: // %bb.3: +; CHECK-SDAG-NEXT: smstart sm +; CHECK-SDAG-NEXT: .LBB5_4: +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: tbz w20, #0, .LBB5_6 +; CHECK-SDAG-NEXT: // %bb.5: +; CHECK-SDAG-NEXT: smstop sm +; CHECK-SDAG-NEXT: .LBB5_6: +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: bl private_za_decl +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: tbz w20, #0, .LBB5_8 +; CHECK-SDAG-NEXT: // %bb.7: +; CHECK-SDAG-NEXT: smstart sm +; CHECK-SDAG-NEXT: .LBB5_8: +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: sub sp, x29, #64 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill @@ -200,10 +254,10 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee( ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: add x29, sp, #64 ; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: mrs x20, SVCR ; CHECK-NEXT: bl __arm_sme_state_size ; CHECK-NEXT: sub sp, sp, x0 ; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: mrs x20, SVCR ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl __arm_sme_save ; CHECK-NEXT: tbz w20, #0, .LBB5_2 @@ -212,6 +266,7 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee( ; CHECK-NEXT: .LBB5_2: ; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: bl private_za_decl +; CHECK-NEXT: bl private_za_decl ; CHECK-NEXT: mov x1, x0 ; CHECK-NEXT: tbz w20, #0, .LBB5_4 ; CHECK-NEXT: // %bb.3: @@ -219,21 +274,6 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee( ; CHECK-NEXT: .LBB5_4: ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl __arm_sme_restore -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: bl __arm_sme_save -; CHECK-NEXT: tbz w20, #0, .LBB5_6 -; CHECK-NEXT: // %bb.5: -; CHECK-NEXT: smstop sm -; CHECK-NEXT: .LBB5_6: -; CHECK-NEXT: mov x0, x1 -; CHECK-NEXT: bl private_za_decl -; CHECK-NEXT: mov x1, x0 -; CHECK-NEXT: tbz w20, #0, .LBB5_8 -; CHECK-NEXT: // %bb.7: -; CHECK-NEXT: smstart sm -; CHECK-NEXT: .LBB5_8: -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: bl __arm_sme_restore ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: sub sp, x29, #64 ; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload @@ -243,46 +283,6 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee( ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x8, x0 -; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: add x29, sp, #64 -; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size -; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0 -; CHECK-NEWLOWERING-NEXT: mov x19, sp -; CHECK-NEWLOWERING-NEXT: mrs x20, SVCR -; CHECK-NEWLOWERING-NEXT: mov x0, x19 -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save -; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: smstop sm -; CHECK-NEWLOWERING-NEXT: .LBB5_2: -; CHECK-NEWLOWERING-NEXT: mov x0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_decl -; CHECK-NEWLOWERING-NEXT: bl private_za_decl -; CHECK-NEWLOWERING-NEXT: mov x1, x0 -; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4 -; CHECK-NEWLOWERING-NEXT: // %bb.3: -; CHECK-NEWLOWERING-NEXT: smstart sm -; CHECK-NEWLOWERING-NEXT: .LBB5_4: -; CHECK-NEWLOWERING-NEXT: mov x0, x19 -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore -; CHECK-NEWLOWERING-NEXT: mov x0, x1 -; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64 -; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret %res = call i64 @private_za_decl(i64 %v) %res2 = call i64 @private_za_decl(i64 %res) ret i64 %res2 @@ -295,6 +295,31 @@ declare i64 @many_args_private_za_callee( ; stack pointer before the call -- in this test the call to __arm_sme_save ; should occur _before_ the stack decrement. define i64 @test_many_callee_arguments( +; CHECK-SDAG-LABEL: test_many_callee_arguments: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: mov x8, x0 +; CHECK-SDAG-NEXT: bl __arm_sme_state_size +; CHECK-SDAG-NEXT: sub sp, sp, x0 +; CHECK-SDAG-NEXT: ldp x9, x10, [x29, #32] +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: stp x9, x10, [sp, #-16]! +; CHECK-SDAG-NEXT: mov x0, x8 +; CHECK-SDAG-NEXT: bl many_args_private_za_callee +; CHECK-SDAG-NEXT: add sp, sp, #16 +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: test_many_callee_arguments: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill @@ -303,8 +328,8 @@ define i64 @test_many_callee_arguments( ; CHECK-NEXT: mov x8, x0 ; CHECK-NEXT: bl __arm_sme_state_size ; CHECK-NEXT: sub sp, sp, x0 -; CHECK-NEXT: ldp x9, x10, [x29, #32] ; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: ldp x9, x10, [x29, #32] ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl __arm_sme_save ; CHECK-NEXT: stp x9, x10, [sp, #-16]! @@ -319,31 +344,6 @@ define i64 @test_many_callee_arguments( ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: test_many_callee_arguments: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: mov x8, x0 -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size -; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0 -; CHECK-NEWLOWERING-NEXT: mov x19, sp -; CHECK-NEWLOWERING-NEXT: ldp x9, x10, [x29, #32] -; CHECK-NEWLOWERING-NEXT: mov x0, x19 -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save -; CHECK-NEWLOWERING-NEXT: stp x9, x10, [sp, #-16]! -; CHECK-NEWLOWERING-NEXT: mov x0, x8 -; CHECK-NEWLOWERING-NEXT: bl many_args_private_za_callee -; CHECK-NEWLOWERING-NEXT: add sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: mov x1, x0 -; CHECK-NEWLOWERING-NEXT: mov x0, x19 -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore -; CHECK-NEWLOWERING-NEXT: mov x0, x1 -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9 ) nounwind "aarch64_za_state_agnostic" { %ret = call i64 @many_args_private_za_callee( @@ -352,6 +352,34 @@ define i64 @test_many_callee_arguments( } define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_state_agnostic" "probe-stack"="inline-asm" "stack-probe-size"="65536"{ +; CHECK-SDAG-LABEL: agnostic_za_buffer_alloc_with_stack_probes: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: bl __arm_sme_state_size +; CHECK-SDAG-NEXT: mov x8, sp +; CHECK-SDAG-NEXT: sub x19, x8, x0 +; CHECK-SDAG-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 +; CHECK-SDAG-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-SDAG-NEXT: cmp sp, x19 +; CHECK-SDAG-NEXT: b.le .LBB7_3 +; CHECK-SDAG-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 +; CHECK-SDAG-NEXT: str xzr, [sp] +; CHECK-SDAG-NEXT: b .LBB7_1 +; CHECK-SDAG-NEXT: .LBB7_3: +; CHECK-SDAG-NEXT: mov sp, x19 +; CHECK-SDAG-NEXT: ldr xzr, [sp] +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: bl private_za +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: agnostic_za_buffer_alloc_with_stack_probes: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill @@ -360,6 +388,8 @@ define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_s ; CHECK-NEXT: bl __arm_sme_state_size ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: sub x19, x8, x0 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_save ; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ; CHECK-NEXT: cmp sp, x19 @@ -370,8 +400,6 @@ define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_s ; CHECK-NEXT: .LBB7_3: ; CHECK-NEXT: mov sp, x19 ; CHECK-NEXT: ldr xzr, [sp] -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: bl __arm_sme_save ; CHECK-NEXT: bl private_za ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl __arm_sme_restore @@ -379,34 +407,6 @@ define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_s ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: agnostic_za_buffer_alloc_with_stack_probes: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size -; CHECK-NEWLOWERING-NEXT: mov x8, sp -; CHECK-NEWLOWERING-NEXT: sub x19, x8, x0 -; CHECK-NEWLOWERING-NEXT: mov x0, x19 -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save -; CHECK-NEWLOWERING-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536 -; CHECK-NEWLOWERING-NEXT: cmp sp, x19 -; CHECK-NEWLOWERING-NEXT: b.le .LBB7_3 -; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 -; CHECK-NEWLOWERING-NEXT: str xzr, [sp] -; CHECK-NEWLOWERING-NEXT: b .LBB7_1 -; CHECK-NEWLOWERING-NEXT: .LBB7_3: -; CHECK-NEWLOWERING-NEXT: mov sp, x19 -; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp] -; CHECK-NEWLOWERING-NEXT: bl private_za -; CHECK-NEWLOWERING-NEXT: mov x0, x19 -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret call void @private_za() ret void } diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 57025ea172097..b5974f5407c73 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -213,19 +213,18 @@ declare double @za_shared_callee(double) "aarch64_inout_za" define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{ ; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee: -; CHECK-COMMON: // %bb.0: // %prelude +; CHECK-COMMON: // %bb.0: // %entry ; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-COMMON-NEXT: cbz x8, .LBB6_2 -; CHECK-COMMON-NEXT: b .LBB6_1 -; CHECK-COMMON-NEXT: .LBB6_1: // %save.za +; CHECK-COMMON-NEXT: cbnz x8, .LBB6_1 +; CHECK-COMMON-NEXT: b .LBB6_2 +; CHECK-COMMON-NEXT: .LBB6_1: // %entry ; CHECK-COMMON-NEXT: bl __arm_tpidr2_save ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: zero {za} ; CHECK-COMMON-NEXT: b .LBB6_2 ; CHECK-COMMON-NEXT: .LBB6_2: // %entry ; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: zero {za} ; CHECK-COMMON-NEXT: bl za_shared_callee ; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-COMMON-NEXT: fmov d1, x8 @@ -254,6 +253,9 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: sub x8, x29, #16 ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 ; CHECK-COMMON-NEXT: bl normal_callee +; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 +; CHECK-COMMON-NEXT: fmov d1, x8 +; CHECK-COMMON-NEXT: fadd d0, d0, d1 ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-COMMON-NEXT: sub x0, x29, #16 @@ -264,9 +266,6 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: b .LBB7_2 ; CHECK-COMMON-NEXT: .LBB7_2: // %entry ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr -; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 -; CHECK-COMMON-NEXT: fmov d1, x8 -; CHECK-COMMON-NEXT: fadd d0, d0, d1 ; CHECK-COMMON-NEXT: mov sp, x29 ; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload @@ -441,18 +440,18 @@ declare double @zt0_shared_callee(double) "aarch64_inout_zt0" define double @zt0_new_caller_to_zt0_shared_callee(double %x) nounwind noinline optnone "aarch64_new_zt0" { ; CHECK-COMMON-LABEL: zt0_new_caller_to_zt0_shared_callee: -; CHECK-COMMON: // %bb.0: // %prelude +; CHECK-COMMON: // %bb.0: // %entry ; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-COMMON-NEXT: cbz x8, .LBB13_2 -; CHECK-COMMON-NEXT: b .LBB13_1 -; CHECK-COMMON-NEXT: .LBB13_1: // %save.za +; CHECK-COMMON-NEXT: cbnz x8, .LBB13_1 +; CHECK-COMMON-NEXT: b .LBB13_2 +; CHECK-COMMON-NEXT: .LBB13_1: // %entry ; CHECK-COMMON-NEXT: bl __arm_tpidr2_save ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: zero { zt0 } ; CHECK-COMMON-NEXT: b .LBB13_2 ; CHECK-COMMON-NEXT: .LBB13_2: // %entry ; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: zero { zt0 } ; CHECK-COMMON-NEXT: bl zt0_shared_callee ; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-COMMON-NEXT: fmov d1, x8 @@ -470,17 +469,18 @@ define double @zt0_shared_caller_to_normal_callee(double %x) nounwind noinline ; CHECK-COMMON-LABEL: zt0_shared_caller_to_normal_callee: ; CHECK-COMMON: // %bb.0: // %entry ; CHECK-COMMON-NEXT: sub sp, sp, #80 -; CHECK-COMMON-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: mov x19, sp -; CHECK-COMMON-NEXT: str zt0, [x19] +; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Spill +; CHECK-COMMON-NEXT: mov x8, sp +; CHECK-COMMON-NEXT: str zt0, [x8] ; CHECK-COMMON-NEXT: smstop za ; CHECK-COMMON-NEXT: bl normal_callee -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x19] ; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-COMMON-NEXT: fmov d1, x8 ; CHECK-COMMON-NEXT: fadd d0, d0, d1 -; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mov x8, sp +; CHECK-COMMON-NEXT: ldr zt0, [x8] +; CHECK-COMMON-NEXT: ldr x30, [sp, #64] // 8-byte Reload ; CHECK-COMMON-NEXT: add sp, sp, #80 ; CHECK-COMMON-NEXT: ret entry: @@ -511,4 +511,3 @@ define void @agnostic_za_function(ptr %ptr) nounwind "aarch64_za_state_agnostic" call void %ptr() ret void } - diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll index 99c65b090adb0..28050960c1da4 100644 --- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll +++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll @@ -10,506 +10,168 @@ declare void @llvm.trap() #0 define void @quux() #1 { ; CHECK-LABEL: quux: -; CHECK: // %bb.0: // %prelude -; CHECK-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill -; CHECK-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #384 -; CHECK-NEXT: .cfi_def_cfa w29, 96 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w23, -40 -; CHECK-NEXT: .cfi_offset w24, -48 -; CHECK-NEXT: .cfi_offset w25, -56 -; CHECK-NEXT: .cfi_offset w26, -64 -; CHECK-NEXT: .cfi_offset w27, -72 -; CHECK-NEXT: .cfi_offset w28, -80 -; CHECK-NEXT: .cfi_offset w30, -88 -; CHECK-NEXT: .cfi_offset w29, -96 -; CHECK-NEXT: rdsvl x8, #1 +; CHECK: // %bb.0: // %bb +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #352 +; CHECK-NEXT: addvl sp, sp, #-21 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xf0, 0x02, 0x92, 0x2e, 0x00, 0x11, 0xa8, 0x01, 0x1e, 0x22 // sp + 368 + 168 * VG +; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: cbz x8, .LBB0_2 -; CHECK-NEXT: b .LBB0_1 -; CHECK-NEXT: .LBB0_1: // %save.za +; CHECK-NEXT: cbnz x8, .LBB0_1 +; CHECK-NEXT: b .LBB0_2 +; CHECK-NEXT: .LBB0_1: // %bb ; CHECK-NEXT: bl __arm_tpidr2_save -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: zero {za} ; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_2: // %bb ; CHECK-NEXT: smstart za -; CHECK-NEXT: zero {za} -; CHECK-NEXT: mov w9, #15 // =0xf -; CHECK-NEXT: // implicit-def: $x8 -; CHECK-NEXT: mov w8, w9 -; CHECK-NEXT: mov x9, x8 -; CHECK-NEXT: incd x9 -; CHECK-NEXT: mov w0, w9 -; CHECK-NEXT: // implicit-def: $x9 -; CHECK-NEXT: mov w9, w0 -; CHECK-NEXT: and x14, x9, #0x70 -; CHECK-NEXT: sub x9, x29, #120 -; CHECK-NEXT: stur x14, [x9, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x14 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #112 -; CHECK-NEXT: stur x9, [x10, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x14 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #104 -; CHECK-NEXT: stur x9, [x10, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, x8 -; CHECK-NEXT: incb x9 -; CHECK-NEXT: mov w0, w9 -; CHECK-NEXT: // implicit-def: $x9 -; CHECK-NEXT: mov w9, w0 -; CHECK-NEXT: and x10, x9, #0x3f0 -; CHECK-NEXT: sub x9, x29, #96 -; CHECK-NEXT: stur x10, [x9, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x10 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x11, x29, #88 -; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x10 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x11, x29, #80 -; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x14 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x11, x29, #72 -; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x14 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x11, x29, #64 -; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x10 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x11, x29, #56 -; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x10 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x11, x29, #48 -; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x14 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x11, x29, #40 -; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x14 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x11, x29, #32 -; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x10 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x11, x29, #24 -; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x10 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x11, x29, #16 -; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x14 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x11, x29, #8 -; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x14 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-256] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x10 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-248] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x10 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-240] // 8-byte Folded Spill -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, #16 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, #16 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, #16 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, #16 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, #16 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, x14 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x9, x9, #16 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: mov x9, x8 -; CHECK-NEXT: incb x9, all, mul #2 -; CHECK-NEXT: mov w0, w9 -; CHECK-NEXT: // implicit-def: $x9 -; CHECK-NEXT: mov w9, w0 -; CHECK-NEXT: and x9, x9, #0x7f0 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: subs x10, x10, x9 -; CHECK-NEXT: and x10, x10, #0xffffffffffffffe0 -; CHECK-NEXT: mov sp, x10 -; CHECK-NEXT: mov x2, sp -; CHECK-NEXT: subs x10, x2, #16 -; CHECK-NEXT: mov sp, x10 -; CHECK-NEXT: stur x10, [x29, #-232] // 8-byte Folded Spill -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: subs x11, x10, x14 -; CHECK-NEXT: mov sp, x11 -; CHECK-NEXT: mov x10, x11 -; CHECK-NEXT: stur x10, [x29, #-224] // 8-byte Folded Spill -; CHECK-NEXT: mov x0, sp -; CHECK-NEXT: subs x10, x0, #16 -; CHECK-NEXT: mov sp, x10 -; CHECK-NEXT: stur x10, [x29, #-216] // 8-byte Folded Spill -; CHECK-NEXT: mov x17, sp -; CHECK-NEXT: subs x10, x17, #16 -; CHECK-NEXT: mov sp, x10 -; CHECK-NEXT: stur x10, [x29, #-208] // 8-byte Folded Spill -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: subs x10, x10, x14 -; CHECK-NEXT: stur x10, [x29, #-32] // 8-byte Folded Spill -; CHECK-NEXT: mov sp, x10 -; CHECK-NEXT: stur x10, [x29, #-200] // 8-byte Folded Spill -; CHECK-NEXT: mov x15, sp -; CHECK-NEXT: subs x10, x15, #16 -; CHECK-NEXT: mov sp, x10 -; CHECK-NEXT: stur x10, [x29, #-192] // 8-byte Folded Spill -; CHECK-NEXT: mov x13, sp -; CHECK-NEXT: subs x10, x13, #16 -; CHECK-NEXT: mov sp, x10 -; CHECK-NEXT: stur x10, [x29, #-184] // 8-byte Folded Spill -; CHECK-NEXT: incw x8 -; CHECK-NEXT: mov w1, w8 -; CHECK-NEXT: // implicit-def: $x8 -; CHECK-NEXT: mov w8, w1 -; CHECK-NEXT: and x12, x8, #0xf0 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: subs x10, x8, x12 -; CHECK-NEXT: mov sp, x10 -; CHECK-NEXT: mov x8, x10 -; CHECK-NEXT: stur x8, [x29, #-176] // 8-byte Folded Spill -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: subs x8, x8, x12 -; CHECK-NEXT: stur x8, [x29, #-24] // 8-byte Folded Spill -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-168] // 8-byte Folded Spill -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: subs x8, x8, x9 -; CHECK-NEXT: and x8, x8, #0xffffffffffffffe0 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-160] // 8-byte Folded Spill -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: subs x8, x8, x9 -; CHECK-NEXT: and x8, x8, #0xffffffffffffffe0 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-152] // 8-byte Folded Spill -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: stur x8, [x29, #-56] // 8-byte Folded Spill -; CHECK-NEXT: subs x8, x8, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: stur x8, [x29, #-48] // 8-byte Folded Spill -; CHECK-NEXT: subs x8, x8, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x24, sp -; CHECK-NEXT: subs x8, x24, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x7, sp -; CHECK-NEXT: subs x8, x7, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x27, sp -; CHECK-NEXT: subs x8, x27, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x26, sp -; CHECK-NEXT: subs x8, x26, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x1, sp -; CHECK-NEXT: subs x8, x1, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: subs x8, x9, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x20, sp -; CHECK-NEXT: subs x8, x20, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x16, sp -; CHECK-NEXT: subs x8, x16, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: stur x8, [x29, #-144] // 8-byte Folded Spill -; CHECK-NEXT: subs x8, x8, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x5, sp -; CHECK-NEXT: subs x8, x5, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x12, sp -; CHECK-NEXT: subs x8, x12, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x22, sp -; CHECK-NEXT: subs x8, x22, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x25, sp -; CHECK-NEXT: subs x8, x25, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x30, sp -; CHECK-NEXT: subs x8, x30, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: stur x8, [x29, #-96] // 8-byte Folded Spill -; CHECK-NEXT: subs x8, x8, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: stur x8, [x29, #-64] // 8-byte Folded Spill -; CHECK-NEXT: subs x8, x8, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: stur x8, [x29, #-128] // 8-byte Folded Spill -; CHECK-NEXT: subs x8, x8, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: stur x8, [x29, #-136] // 8-byte Folded Spill -; CHECK-NEXT: subs x8, x8, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: stur x8, [x29, #-120] // 8-byte Folded Spill -; CHECK-NEXT: subs x8, x8, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: stur x8, [x29, #-80] // 8-byte Folded Spill -; CHECK-NEXT: subs x8, x8, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: stur x8, [x29, #-112] // 8-byte Folded Spill -; CHECK-NEXT: subs x8, x8, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: stur x8, [x29, #-88] // 8-byte Folded Spill -; CHECK-NEXT: subs x8, x8, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x6, sp -; CHECK-NEXT: subs x8, x6, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x21, sp -; CHECK-NEXT: subs x8, x21, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: stur x8, [x29, #-40] // 8-byte Folded Spill -; CHECK-NEXT: subs x8, x8, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x28, sp -; CHECK-NEXT: subs x8, x28, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: subs x4, x8, x14 -; CHECK-NEXT: mov sp, x4 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: subs x3, x8, x14 -; CHECK-NEXT: mov sp, x3 -; CHECK-NEXT: mov x23, sp -; CHECK-NEXT: subs x8, x23, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x18, sp -; CHECK-NEXT: subs x8, x18, #16 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov x14, sp -; CHECK-NEXT: subs x8, x14, #16 -; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: sturb w8, [x9, #-16] -; CHECK-NEXT: ldur x9, [x29, #-144] // 8-byte Folded Reload -; CHECK-NEXT: sturb w8, [x9, #-16] -; CHECK-NEXT: ldur x9, [x29, #-96] // 8-byte Folded Reload -; CHECK-NEXT: sturb w8, [x30, #-16] +; CHECK-NEXT: strb w8, [sp, #207] +; CHECK-NEXT: strb w8, [sp, #183] +; CHECK-NEXT: strb w8, [sp, #143] ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: stur x8, [x29, #-16] // 8-byte Folded Spill -; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldur x8, [x20, #-16] -; CHECK-NEXT: ldur x9, [x27, #-16] -; CHECK-NEXT: add x30, x8, x9, lsl #2 -; CHECK-NEXT: ldur x8, [x1, #-16] -; CHECK-NEXT: subs x8, x8, #1 -; CHECK-NEXT: ldur x9, [x16, #-16] -; CHECK-NEXT: mul x8, x8, x9 -; CHECK-NEXT: ldur x9, [x29, #-64] // 8-byte Folded Reload -; CHECK-NEXT: add x30, x30, x8, lsl #2 -; CHECK-NEXT: ldur x8, [x29, #-96] // 8-byte Folded Reload -; CHECK-NEXT: stur x30, [x8, #-16] -; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload -; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldur x8, [x5, #-16] -; CHECK-NEXT: ldur x9, [x26, #-16] -; CHECK-NEXT: add x30, x8, x9, lsl #2 -; CHECK-NEXT: ldur x8, [x1, #-16] -; CHECK-NEXT: subs x8, x8, #1 -; CHECK-NEXT: ldur x9, [x12, #-16] -; CHECK-NEXT: mul x8, x8, x9 -; CHECK-NEXT: ldur x9, [x29, #-128] // 8-byte Folded Reload -; CHECK-NEXT: add x30, x30, x8, lsl #2 -; CHECK-NEXT: ldur x8, [x29, #-64] // 8-byte Folded Reload -; CHECK-NEXT: stur x30, [x8, #-16] -; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload -; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldur x8, [x22, #-16] -; CHECK-NEXT: ldur x9, [x27, #-16] -; CHECK-NEXT: add x30, x8, x9, lsl #2 -; CHECK-NEXT: ldur x8, [x26, #-16] -; CHECK-NEXT: subs x8, x8, #1 -; CHECK-NEXT: ldur x9, [x25, #-16] -; CHECK-NEXT: mul x8, x8, x9 -; CHECK-NEXT: ldur x9, [x29, #-136] // 8-byte Folded Reload -; CHECK-NEXT: add x30, x30, x8, lsl #2 -; CHECK-NEXT: ldur x8, [x29, #-128] // 8-byte Folded Reload -; CHECK-NEXT: stur x30, [x8, #-16] -; CHECK-NEXT: ldur x8, [x29, #-120] // 8-byte Folded Reload -; CHECK-NEXT: mov w30, #32 // =0x20 -; CHECK-NEXT: // kill: def $lr killed $w30 -; CHECK-NEXT: stur x30, [x9, #-16] -; CHECK-NEXT: ldur x9, [x29, #-80] // 8-byte Folded Reload -; CHECK-NEXT: stur x30, [x8, #-16] -; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload -; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldur x8, [x1, #-16] -; CHECK-NEXT: lsl x8, x8, #5 -; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldur x9, [x29, #-112] // 8-byte Folded Reload -; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload -; CHECK-NEXT: stur x30, [x16, #-16] -; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldur x8, [x27, #-16] -; CHECK-NEXT: subs x8, x8, #1 -; CHECK-NEXT: lsr x8, x8, #5 -; CHECK-NEXT: add x8, x8, #1 -; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldur x8, [x20, #-16] -; CHECK-NEXT: stur x8, [x29, #-104] // 8-byte Folded Spill -; CHECK-NEXT: ldur x8, [x29, #-80] // 8-byte Folded Reload -; CHECK-NEXT: ldur x9, [x9, #-16] -; CHECK-NEXT: ldur x8, [x8, #-16] -; CHECK-NEXT: mul x9, x9, x8 -; CHECK-NEXT: ldur x8, [x29, #-104] // 8-byte Folded Reload -; CHECK-NEXT: add x8, x8, x9, lsl #2 -; CHECK-NEXT: ldur x9, [x29, #-96] // 8-byte Folded Reload -; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldur x9, [x29, #-88] // 8-byte Folded Reload -; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload -; CHECK-NEXT: stur x30, [x12, #-16] -; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldur x8, [x26, #-16] -; CHECK-NEXT: subs x8, x8, #1 -; CHECK-NEXT: lsr x8, x8, #5 -; CHECK-NEXT: add x8, x8, #1 -; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldur x8, [x5, #-16] -; CHECK-NEXT: stur x8, [x29, #-72] // 8-byte Folded Spill -; CHECK-NEXT: ldur x8, [x29, #-80] // 8-byte Folded Reload -; CHECK-NEXT: ldur x9, [x9, #-16] -; CHECK-NEXT: ldur x8, [x8, #-16] -; CHECK-NEXT: mul x9, x9, x8 -; CHECK-NEXT: ldur x8, [x29, #-72] // 8-byte Folded Reload -; CHECK-NEXT: add x8, x8, x9, lsl #2 -; CHECK-NEXT: ldur x9, [x29, #-64] // 8-byte Folded Reload -; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldur x9, [x29, #-40] // 8-byte Folded Reload -; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload -; CHECK-NEXT: stur x8, [x6, #-16] -; CHECK-NEXT: stur x8, [x6, #-16] -; CHECK-NEXT: stur x8, [x21, #-16] -; CHECK-NEXT: stur x8, [x21, #-16] -; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldur x8, [x27, #-16] -; CHECK-NEXT: ldur x9, [x21, #-16] -; CHECK-NEXT: subs x8, x8, x9 -; CHECK-NEXT: ldur x9, [x29, #-56] // 8-byte Folded Reload -; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldur x8, [x29, #-48] // 8-byte Folded Reload -; CHECK-NEXT: stur x30, [x8, #-16] -; CHECK-NEXT: ldur x8, [x29, #-40] // 8-byte Folded Reload -; CHECK-NEXT: ldur x9, [x9, #-16] -; CHECK-NEXT: stur x9, [x8, #-16] -; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload -; CHECK-NEXT: stur x8, [x28, #-16] -; CHECK-NEXT: ldur x8, [x26, #-16] -; CHECK-NEXT: ldur x9, [x6, #-16] -; CHECK-NEXT: subs x8, x8, x9 -; CHECK-NEXT: ldur x9, [x29, #-32] // 8-byte Folded Reload -; CHECK-NEXT: stur x8, [x24, #-16] -; CHECK-NEXT: ldur x8, [x29, #-24] // 8-byte Folded Reload -; CHECK-NEXT: stur x30, [x7, #-16] -; CHECK-NEXT: ldur x7, [x29, #-16] // 8-byte Folded Reload -; CHECK-NEXT: ldur x24, [x24, #-16] -; CHECK-NEXT: stur x24, [x28, #-16] -; CHECK-NEXT: ldur x24, [x21, #-16] -; CHECK-NEXT: ldur x27, [x27, #-16] -; CHECK-NEXT: whilelt pn8.s, x24, x27, vlx2 -; CHECK-NEXT: str pn8, [x4] -; CHECK-NEXT: ldur x24, [x6, #-16] -; CHECK-NEXT: ldur x26, [x26, #-16] -; CHECK-NEXT: whilelt pn8.s, x24, x26, vlx2 -; CHECK-NEXT: str pn8, [x3] -; CHECK-NEXT: stur x7, [x23, #-16] -; CHECK-NEXT: ldur x22, [x22, #-16] -; CHECK-NEXT: ldur x24, [x21, #-16] -; CHECK-NEXT: add x22, x22, x24, lsl #2 -; CHECK-NEXT: ldur x24, [x6, #-16] -; CHECK-NEXT: ldur x25, [x25, #-16] -; CHECK-NEXT: mul x24, x24, x25 -; CHECK-NEXT: add x22, x22, x24, lsl #2 -; CHECK-NEXT: stur x22, [x23, #-16] +; CHECK-NEXT: str x8, [sp, #8] // 8-byte Spill +; CHECK-NEXT: str x8, [sp, #128] +; CHECK-NEXT: ldr x9, [sp, #192] +; CHECK-NEXT: ldr x10, [sp, #224] +; CHECK-NEXT: add x9, x9, x10, lsl #2 +; CHECK-NEXT: ldr x10, [sp, #208] +; CHECK-NEXT: subs x10, x10, #1 +; CHECK-NEXT: ldr x11, [sp, #184] +; CHECK-NEXT: mul x10, x10, x11 +; CHECK-NEXT: add x9, x9, x10, lsl #2 +; CHECK-NEXT: str x9, [sp, #128] +; CHECK-NEXT: str x8, [sp, #120] +; CHECK-NEXT: ldr x9, [sp, #168] +; CHECK-NEXT: ldr x10, [sp, #216] +; CHECK-NEXT: add x9, x9, x10, lsl #2 +; CHECK-NEXT: ldr x10, [sp, #208] +; CHECK-NEXT: subs x10, x10, #1 +; CHECK-NEXT: ldr x11, [sp, #160] +; CHECK-NEXT: mul x10, x10, x11 +; CHECK-NEXT: add x9, x9, x10, lsl #2 +; CHECK-NEXT: str x9, [sp, #120] +; CHECK-NEXT: str x8, [sp, #112] +; CHECK-NEXT: ldr x9, [sp, #152] +; CHECK-NEXT: ldr x10, [sp, #224] +; CHECK-NEXT: add x9, x9, x10, lsl #2 +; CHECK-NEXT: ldr x10, [sp, #216] +; CHECK-NEXT: subs x10, x10, #1 +; CHECK-NEXT: ldr x11, [sp, #144] +; CHECK-NEXT: mul x10, x10, x11 +; CHECK-NEXT: add x9, x9, x10, lsl #2 +; CHECK-NEXT: str x9, [sp, #112] +; CHECK-NEXT: mov w9, #32 // =0x20 +; CHECK-NEXT: // kill: def $x9 killed $w9 +; CHECK-NEXT: str x9, [sp, #104] +; CHECK-NEXT: str x9, [sp, #96] +; CHECK-NEXT: str x8, [sp, #88] +; CHECK-NEXT: ldr x10, [sp, #208] +; CHECK-NEXT: lsl x10, x10, #5 +; CHECK-NEXT: str x10, [sp, #88] +; CHECK-NEXT: str x9, [sp, #184] +; CHECK-NEXT: str x8, [sp, #80] +; CHECK-NEXT: ldr x10, [sp, #224] +; CHECK-NEXT: subs x10, x10, #1 +; CHECK-NEXT: lsr x10, x10, #5 +; CHECK-NEXT: add x10, x10, #1 +; CHECK-NEXT: str x10, [sp, #80] +; CHECK-NEXT: ldr x10, [sp, #192] +; CHECK-NEXT: ldr x11, [sp, #80] +; CHECK-NEXT: ldr x12, [sp, #88] +; CHECK-NEXT: mul x11, x11, x12 +; CHECK-NEXT: add x10, x10, x11, lsl #2 +; CHECK-NEXT: str x10, [sp, #128] +; CHECK-NEXT: str x9, [sp, #160] +; CHECK-NEXT: str x8, [sp, #72] +; CHECK-NEXT: ldr x10, [sp, #216] +; CHECK-NEXT: subs x10, x10, #1 +; CHECK-NEXT: lsr x10, x10, #5 +; CHECK-NEXT: add x10, x10, #1 +; CHECK-NEXT: str x10, [sp, #72] +; CHECK-NEXT: ldr x10, [sp, #168] +; CHECK-NEXT: ldr x11, [sp, #72] +; CHECK-NEXT: ldr x12, [sp, #88] +; CHECK-NEXT: mul x11, x11, x12 +; CHECK-NEXT: add x10, x10, x11, lsl #2 +; CHECK-NEXT: str x10, [sp, #120] +; CHECK-NEXT: str x8, [sp, #64] +; CHECK-NEXT: str x8, [sp, #64] +; CHECK-NEXT: str x8, [sp, #56] +; CHECK-NEXT: str x8, [sp, #56] +; CHECK-NEXT: str x8, [sp, #48] +; CHECK-NEXT: ldr x10, [sp, #224] +; CHECK-NEXT: ldr x11, [sp, #56] +; CHECK-NEXT: subs x10, x10, x11 +; CHECK-NEXT: str x10, [sp, #256] +; CHECK-NEXT: str x9, [sp, #248] +; CHECK-NEXT: ldr x10, [sp, #256] +; CHECK-NEXT: str x10, [sp, #48] +; CHECK-NEXT: str x8, [sp, #40] +; CHECK-NEXT: ldr x10, [sp, #216] +; CHECK-NEXT: ldr x11, [sp, #64] +; CHECK-NEXT: subs x10, x10, x11 +; CHECK-NEXT: str x10, [sp, #240] +; CHECK-NEXT: str x9, [sp, #232] +; CHECK-NEXT: ldr x9, [sp, #240] +; CHECK-NEXT: str x9, [sp, #40] +; CHECK-NEXT: ldr x9, [sp, #56] +; CHECK-NEXT: ldr x10, [sp, #224] +; CHECK-NEXT: whilelt pn8.s, x9, x10, vlx2 +; CHECK-NEXT: add x9, sp, #352 +; CHECK-NEXT: str pn8, [x9, #7, mul vl] +; CHECK-NEXT: ldr x9, [sp, #64] +; CHECK-NEXT: ldr x10, [sp, #216] +; CHECK-NEXT: whilelt pn8.s, x9, x10, vlx2 +; CHECK-NEXT: add x9, sp, #352 +; CHECK-NEXT: str pn8, [x9, #6, mul vl] +; CHECK-NEXT: str x8, [sp, #32] +; CHECK-NEXT: ldr x9, [sp, #152] +; CHECK-NEXT: ldr x10, [sp, #56] +; CHECK-NEXT: add x9, x9, x10, lsl #2 +; CHECK-NEXT: ldr x10, [sp, #64] +; CHECK-NEXT: ldr x11, [sp, #144] +; CHECK-NEXT: mul x10, x10, x11 +; CHECK-NEXT: add x9, x9, x10, lsl #2 +; CHECK-NEXT: str x9, [sp, #32] ; CHECK-NEXT: zero {za} -; CHECK-NEXT: stur x7, [x18, #-16] -; CHECK-NEXT: ldur x20, [x20, #-16] -; CHECK-NEXT: ldur x21, [x21, #-16] -; CHECK-NEXT: ldur x22, [x1, #-16] -; CHECK-NEXT: mul x21, x21, x22 -; CHECK-NEXT: add x20, x20, x21, lsl #2 -; CHECK-NEXT: stur x20, [x18, #-16] -; CHECK-NEXT: stur x7, [x14, #-16] -; CHECK-NEXT: ldur x5, [x5, #-16] -; CHECK-NEXT: ldur x6, [x6, #-16] -; CHECK-NEXT: ldur x7, [x1, #-16] -; CHECK-NEXT: mul x6, x6, x7 -; CHECK-NEXT: add x5, x5, x6, lsl #2 -; CHECK-NEXT: stur x5, [x14, #-16] -; CHECK-NEXT: ldur x1, [x1, #-16] -; CHECK-NEXT: ldr p1, [x4] -; CHECK-NEXT: ldur x18, [x18, #-16] -; CHECK-NEXT: ldur x16, [x16, #-16] -; CHECK-NEXT: lsr x16, x16, #2 -; CHECK-NEXT: ldr p0, [x3] -; CHECK-NEXT: ldur x14, [x14, #-16] -; CHECK-NEXT: ldur x12, [x12, #-16] -; CHECK-NEXT: lsr x12, x12, #2 -; CHECK-NEXT: stur x1, [x2, #-16] -; CHECK-NEXT: str p1, [x11] -; CHECK-NEXT: stur x18, [x0, #-16] -; CHECK-NEXT: stur x16, [x17, #-16] -; CHECK-NEXT: str p0, [x9] -; CHECK-NEXT: stur x14, [x15, #-16] -; CHECK-NEXT: stur x12, [x13, #-16] -; CHECK-NEXT: ldr p0, [x11] +; CHECK-NEXT: str x8, [sp, #24] +; CHECK-NEXT: ldr x9, [sp, #192] +; CHECK-NEXT: ldr x10, [sp, #56] +; CHECK-NEXT: ldr x11, [sp, #208] +; CHECK-NEXT: mul x10, x10, x11 +; CHECK-NEXT: add x9, x9, x10, lsl #2 +; CHECK-NEXT: str x9, [sp, #24] +; CHECK-NEXT: str x8, [sp, #16] +; CHECK-NEXT: ldr x8, [sp, #168] +; CHECK-NEXT: ldr x9, [sp, #64] +; CHECK-NEXT: ldr x10, [sp, #208] +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: add x8, x8, x9, lsl #2 +; CHECK-NEXT: str x8, [sp, #16] +; CHECK-NEXT: ldr x12, [sp, #208] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p1, [x8, #7, mul vl] +; CHECK-NEXT: ldr x11, [sp, #24] +; CHECK-NEXT: ldr x8, [sp, #184] +; CHECK-NEXT: lsr x10, x8, #2 +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p0, [x8, #6, mul vl] +; CHECK-NEXT: ldr x9, [sp, #16] +; CHECK-NEXT: ldr x8, [sp, #160] +; CHECK-NEXT: lsr x8, x8, #2 +; CHECK-NEXT: str x12, [sp, #296] +; CHECK-NEXT: add x12, sp, #352 +; CHECK-NEXT: str p1, [x12, #47, mul vl] +; CHECK-NEXT: str x11, [sp, #288] +; CHECK-NEXT: str x10, [sp, #280] +; CHECK-NEXT: add x10, sp, #352 +; CHECK-NEXT: str p0, [x10, #46, mul vl] +; CHECK-NEXT: str x9, [sp, #272] +; CHECK-NEXT: str x8, [sp, #264] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p0, [x8, #47, mul vl] ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: pext { p3.s, p4.s }, pn8[0] ; CHECK-NEXT: mov p0.b, p3.b @@ -517,146 +179,168 @@ define void @quux() #1 { ; CHECK-NEXT: and p0.b, p0/z, p0.b, p2.b ; CHECK-NEXT: mov p1.b, p4.b ; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b -; CHECK-NEXT: mov x11, x10 -; CHECK-NEXT: incd x11 -; CHECK-NEXT: str p1, [x11] -; CHECK-NEXT: str p0, [x10] -; CHECK-NEXT: ldr p0, [x9] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: addpl x8, x8, #31 +; CHECK-NEXT: addpl x8, x8, #13 +; CHECK-NEXT: incd x8 +; CHECK-NEXT: str p1, [x8] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str p0, [x8, #44, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p0, [x8, #46, mul vl] ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: pext { p3.s, p4.s }, pn8[0] ; CHECK-NEXT: mov p0.b, p3.b ; CHECK-NEXT: and p0.b, p0/z, p0.b, p2.b ; CHECK-NEXT: mov p1.b, p4.b ; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b -; CHECK-NEXT: mov x9, x8 -; CHECK-NEXT: incd x9 -; CHECK-NEXT: str p1, [x9] -; CHECK-NEXT: str p0, [x8] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: addpl x8, x8, #31 +; CHECK-NEXT: addpl x8, x8, #11 +; CHECK-NEXT: incd x8 +; CHECK-NEXT: str p1, [x8] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str p0, [x8, #42, mul vl] ; CHECK-NEXT: b .LBB0_3 ; CHECK-NEXT: .LBB0_3: // %bb178 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldur x9, [x29, #-232] // 8-byte Folded Reload -; CHECK-NEXT: sub x8, x29, #80 -; CHECK-NEXT: ldur x8, [x8, #-256] // 8-byte Folded Reload -; CHECK-NEXT: sub x10, x29, #88 -; CHECK-NEXT: ldur x10, [x10, #-256] // 8-byte Folded Reload -; CHECK-NEXT: sub x11, x29, #104 -; CHECK-NEXT: ldur x11, [x11, #-256] // 8-byte Folded Reload -; CHECK-NEXT: sub x12, x29, #112 -; CHECK-NEXT: ldur x12, [x12, #-256] // 8-byte Folded Reload -; CHECK-NEXT: ldur x13, [x29, #-152] // 8-byte Folded Reload -; CHECK-NEXT: ldur x14, [x29, #-160] // 8-byte Folded Reload -; CHECK-NEXT: sub x15, x29, #48 -; CHECK-NEXT: ldur x17, [x15, #-256] // 8-byte Folded Reload -; CHECK-NEXT: sub x15, x29, #56 -; CHECK-NEXT: ldur x18, [x15, #-256] // 8-byte Folded Reload -; CHECK-NEXT: sub x15, x29, #64 -; CHECK-NEXT: ldur x0, [x15, #-256] // 8-byte Folded Reload -; CHECK-NEXT: sub x15, x29, #72 -; CHECK-NEXT: ldur x1, [x15, #-256] // 8-byte Folded Reload -; CHECK-NEXT: ldur x15, [x29, #-168] // 8-byte Folded Reload -; CHECK-NEXT: ldur x2, [x29, #-176] // 8-byte Folded Reload -; CHECK-NEXT: sub x16, x29, #16 -; CHECK-NEXT: ldur x3, [x16, #-256] // 8-byte Folded Reload -; CHECK-NEXT: sub x16, x29, #24 -; CHECK-NEXT: ldur x4, [x16, #-256] // 8-byte Folded Reload -; CHECK-NEXT: sub x16, x29, #32 -; CHECK-NEXT: ldur x5, [x16, #-256] // 8-byte Folded Reload -; CHECK-NEXT: sub x16, x29, #40 -; CHECK-NEXT: ldur x6, [x16, #-256] // 8-byte Folded Reload -; CHECK-NEXT: ldur x16, [x29, #-240] // 8-byte Folded Reload -; CHECK-NEXT: ldur x7, [x29, #-248] // 8-byte Folded Reload -; CHECK-NEXT: ldur x20, [x29, #-256] // 8-byte Folded Reload -; CHECK-NEXT: sub x21, x29, #8 -; CHECK-NEXT: ldur x21, [x21, #-256] // 8-byte Folded Reload -; CHECK-NEXT: ldur x23, [x29, #-192] // 8-byte Folded Reload -; CHECK-NEXT: ldur x22, [x29, #-184] // 8-byte Folded Reload -; CHECK-NEXT: ldur x24, [x29, #-200] // 8-byte Folded Reload -; CHECK-NEXT: ldur x26, [x29, #-216] // 8-byte Folded Reload -; CHECK-NEXT: ldur x25, [x29, #-208] // 8-byte Folded Reload -; CHECK-NEXT: ldur x27, [x29, #-224] // 8-byte Folded Reload -; CHECK-NEXT: ldr p0, [x27] -; CHECK-NEXT: ldr x27, [x26] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p0, [x8, #47, mul vl] +; CHECK-NEXT: ldr x8, [sp, #288] ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x27] +; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x8] ; CHECK-NEXT: mov z0.d, z16.d ; CHECK-NEXT: mov z1.d, z24.d -; CHECK-NEXT: str z1, [x14, #1, mul vl] -; CHECK-NEXT: str z0, [x14] -; CHECK-NEXT: ldr x27, [x25] -; CHECK-NEXT: ldr x25, [x26] -; CHECK-NEXT: add x25, x25, x27, lsl #2 -; CHECK-NEXT: str x25, [x26] -; CHECK-NEXT: ldr p0, [x24] -; CHECK-NEXT: ldr x24, [x23] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str z1, [x8, #4, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str z0, [x8, #3, mul vl] +; CHECK-NEXT: ldr x9, [sp, #280] +; CHECK-NEXT: ldr x8, [sp, #288] +; CHECK-NEXT: add x8, x8, x9, lsl #2 +; CHECK-NEXT: str x8, [sp, #288] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p0, [x8, #46, mul vl] +; CHECK-NEXT: ldr x8, [sp, #272] ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x24] +; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x8] ; CHECK-NEXT: mov z0.d, z16.d ; CHECK-NEXT: mov z1.d, z24.d -; CHECK-NEXT: str z1, [x13, #1, mul vl] -; CHECK-NEXT: str z0, [x13] -; CHECK-NEXT: ldr x24, [x22] -; CHECK-NEXT: ldr x22, [x23] -; CHECK-NEXT: add x22, x22, x24, lsl #2 -; CHECK-NEXT: str x22, [x23] -; CHECK-NEXT: ldr p1, [x2] -; CHECK-NEXT: ldr p0, [x15] -; CHECK-NEXT: ldr z1, [x14] -; CHECK-NEXT: ldr z0, [x13] -; CHECK-NEXT: str p1, [x21] -; CHECK-NEXT: str p0, [x20] -; CHECK-NEXT: str z1, [x7] -; CHECK-NEXT: str z0, [x16] -; CHECK-NEXT: ldr p0, [x21] -; CHECK-NEXT: ldr p1, [x20] -; CHECK-NEXT: ldr z0, [x7] -; CHECK-NEXT: ldr z1, [x16] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str z1, [x8, #2, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str z0, [x8, #1, mul vl] +; CHECK-NEXT: ldr x9, [sp, #264] +; CHECK-NEXT: ldr x8, [sp, #272] +; CHECK-NEXT: add x8, x8, x9, lsl #2 +; CHECK-NEXT: str x8, [sp, #272] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p1, [x8, #44, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p0, [x8, #42, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr z1, [x8, #3, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr z0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str p1, [x8, #95, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str p0, [x8, #94, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str z1, [x8, #10, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str z0, [x8, #9, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p0, [x8, #95, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p1, [x8, #94, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr z0, [x8, #10, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr z1, [x8, #9, mul vl] ; CHECK-NEXT: fmopa za0.s, p0/m, p1/m, z0.s, z1.s -; CHECK-NEXT: mov x16, x2 -; CHECK-NEXT: incd x16 -; CHECK-NEXT: ldr p1, [x16] -; CHECK-NEXT: ldr p0, [x15] -; CHECK-NEXT: ldr z1, [x14, #1, mul vl] -; CHECK-NEXT: ldr z0, [x13] -; CHECK-NEXT: str p1, [x6] -; CHECK-NEXT: str p0, [x5] -; CHECK-NEXT: str z1, [x4] -; CHECK-NEXT: str z0, [x3] -; CHECK-NEXT: ldr p0, [x6] -; CHECK-NEXT: ldr p1, [x5] -; CHECK-NEXT: ldr z0, [x4] -; CHECK-NEXT: ldr z1, [x3] +; CHECK-NEXT: add x9, sp, #352 +; CHECK-NEXT: addpl x9, x9, #31 +; CHECK-NEXT: addpl x9, x9, #13 +; CHECK-NEXT: incd x9 +; CHECK-NEXT: ldr p1, [x9] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p0, [x8, #42, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr z1, [x8, #4, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr z0, [x8, #1, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str p1, [x8, #119, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str p0, [x8, #118, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str z1, [x8, #13, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str z0, [x8, #12, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p0, [x8, #119, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p1, [x8, #118, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr z0, [x8, #13, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr z1, [x8, #12, mul vl] ; CHECK-NEXT: fmopa za1.s, p0/m, p1/m, z0.s, z1.s -; CHECK-NEXT: ldr p1, [x2] -; CHECK-NEXT: incd x15 -; CHECK-NEXT: ldr p0, [x15] -; CHECK-NEXT: ldr z1, [x14] -; CHECK-NEXT: ldr z0, [x13, #1, mul vl] -; CHECK-NEXT: str p1, [x1] -; CHECK-NEXT: str p0, [x0] -; CHECK-NEXT: str z1, [x18] -; CHECK-NEXT: str z0, [x17] -; CHECK-NEXT: ldr p0, [x1] -; CHECK-NEXT: ldr p1, [x0] -; CHECK-NEXT: ldr z0, [x18] -; CHECK-NEXT: ldr z1, [x17] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p1, [x8, #44, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: addpl x8, x8, #31 +; CHECK-NEXT: addpl x8, x8, #11 +; CHECK-NEXT: incd x8 +; CHECK-NEXT: ldr p0, [x8] +; CHECK-NEXT: add x10, sp, #352 +; CHECK-NEXT: ldr z1, [x10, #3, mul vl] +; CHECK-NEXT: add x10, sp, #352 +; CHECK-NEXT: ldr z0, [x10, #2, mul vl] +; CHECK-NEXT: add x10, sp, #352 +; CHECK-NEXT: str p1, [x10, #143, mul vl] +; CHECK-NEXT: add x10, sp, #352 +; CHECK-NEXT: str p0, [x10, #142, mul vl] +; CHECK-NEXT: add x10, sp, #352 +; CHECK-NEXT: str z1, [x10, #16, mul vl] +; CHECK-NEXT: add x10, sp, #352 +; CHECK-NEXT: str z0, [x10, #15, mul vl] +; CHECK-NEXT: add x10, sp, #352 +; CHECK-NEXT: ldr p0, [x10, #143, mul vl] +; CHECK-NEXT: add x10, sp, #352 +; CHECK-NEXT: ldr p1, [x10, #142, mul vl] +; CHECK-NEXT: add x10, sp, #352 +; CHECK-NEXT: ldr z0, [x10, #16, mul vl] +; CHECK-NEXT: add x10, sp, #352 +; CHECK-NEXT: ldr z1, [x10, #15, mul vl] ; CHECK-NEXT: fmopa za2.s, p0/m, p1/m, z0.s, z1.s -; CHECK-NEXT: ldr p1, [x16] -; CHECK-NEXT: ldr p0, [x15] -; CHECK-NEXT: ldr z1, [x14, #1, mul vl] -; CHECK-NEXT: ldr z0, [x13, #1, mul vl] -; CHECK-NEXT: str p1, [x12] -; CHECK-NEXT: str p0, [x11] -; CHECK-NEXT: str z1, [x10] -; CHECK-NEXT: str z0, [x8] -; CHECK-NEXT: ldr p0, [x12] -; CHECK-NEXT: ldr p1, [x11] -; CHECK-NEXT: ldr z0, [x10] -; CHECK-NEXT: ldr z1, [x8] +; CHECK-NEXT: ldr p1, [x9] +; CHECK-NEXT: ldr p0, [x8] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr z1, [x8, #4, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr z0, [x8, #2, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str p1, [x8, #167, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str p0, [x8, #166, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str z1, [x8, #19, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: str z0, [x8, #18, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p0, [x8, #167, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr p1, [x8, #166, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr z0, [x8, #19, mul vl] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: ldr z1, [x8, #18, mul vl] ; CHECK-NEXT: fmopa za3.s, p0/m, p1/m, z0.s, z1.s -; CHECK-NEXT: ldr x8, [x9] +; CHECK-NEXT: ldr x8, [sp, #296] ; CHECK-NEXT: subs x8, x8, #1 -; CHECK-NEXT: str x8, [x9] +; CHECK-NEXT: str x8, [sp, #296] ; CHECK-NEXT: b .LBB0_3 bb: %alloca = alloca , align 2 diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index e672f777703a6..188059baa6675 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme -aarch64-new-sme-abi=false < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG ; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK -; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING declare void @private_za_callee() declare void @shared_za_callee() "aarch64_inout_za" @@ -42,19 +42,57 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; Test lazy-save mechanism for multiple callees. define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { +; CHECK-SDAG-LABEL: test_lazy_save_2_callees: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: sub x20, x29, #16 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl private_za_callee +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_2 +; CHECK-SDAG-NEXT: // %bb.1: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_2: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl private_za_callee +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_4 +; CHECK-SDAG-NEXT: // %bb.3: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_4: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: test_lazy_save_2_callees: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -64,48 +102,10 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: msr TPIDR2_EL0, x20 -; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB1_4 -; CHECK-NEXT: // %bb.3: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB1_4: -; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: test_lazy_save_2_callees: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB1_2: -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret call void @private_za_callee() call void @private_za_callee() ret void @@ -145,6 +145,50 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou ; Test a combination of streaming-compatible -> normal call with lazy-save. define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za" "aarch64_pstate_sm_compatible" { +; CHECK-SDAG-LABEL: test_lazy_save_and_conditional_smstart: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: add x29, sp, #64 +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: mrs x20, SVCR +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: sub x10, x29, #80 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-80] +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x10 +; CHECK-SDAG-NEXT: tbz w20, #0, .LBB3_2 +; CHECK-SDAG-NEXT: // %bb.1: +; CHECK-SDAG-NEXT: smstop sm +; CHECK-SDAG-NEXT: .LBB3_2: +; CHECK-SDAG-NEXT: bl private_za_callee +; CHECK-SDAG-NEXT: tbz w20, #0, .LBB3_4 +; CHECK-SDAG-NEXT: // %bb.3: +; CHECK-SDAG-NEXT: smstart sm +; CHECK-SDAG-NEXT: .LBB3_4: +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #80 +; CHECK-SDAG-NEXT: cbnz x8, .LBB3_6 +; CHECK-SDAG-NEXT: // %bb.5: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB3_6: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: sub sp, x29, #64 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: test_lazy_save_and_conditional_smstart: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill @@ -157,12 +201,12 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mrs x20, SVCR ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #80 ; CHECK-NEXT: stp x9, x8, [x29, #-80] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: mrs x20, SVCR +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: tbz w20, #0, .LBB3_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm @@ -188,50 +232,6 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: test_lazy_save_and_conditional_smstart: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: add x29, sp, #64 -; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-80] -; CHECK-NEWLOWERING-NEXT: mrs x20, SVCR -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #80 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB3_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: smstop sm -; CHECK-NEWLOWERING-NEXT: .LBB3_2: -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB3_4 -; CHECK-NEWLOWERING-NEXT: // %bb.3: -; CHECK-NEWLOWERING-NEXT: smstart sm -; CHECK-NEWLOWERING-NEXT: .LBB3_4: -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #80 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB3_6 -; CHECK-NEWLOWERING-NEXT: // %bb.5: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB3_6: -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64 -; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret call void @private_za_callee() ret void } @@ -240,15 +240,67 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; restore from it (since ZA is off on return). We could improve this case ; by turning ZA off before the final private ZA call. define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za" +; CHECK-SDAG-LABEL: test_lazy_save_mixed_shared_and_private_callees: +; CHECK-SDAG: // %bb.0: // %prelude +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w20, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: cbz x8, .LBB4_2 +; CHECK-SDAG-NEXT: // %bb.1: // %save.za +; CHECK-SDAG-NEXT: bl __arm_tpidr2_save +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .LBB4_2: +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: sub x20, x29, #16 +; CHECK-SDAG-NEXT: zero {za} +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl private_za_callee +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB4_4 +; CHECK-SDAG-NEXT: // %bb.3: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB4_4: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: bl shared_za_callee +; CHECK-SDAG-NEXT: bl preserves_za_callee +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl private_za_callee +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB4_6 +; CHECK-SDAG-NEXT: // %bb.5: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB4_6: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: test_lazy_save_mixed_shared_and_private_callees: -; CHECK: // %bb.0: // %prelude +; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa w29, 32 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w19, -16 ; CHECK-NEXT: .cfi_offset w30, -24 ; CHECK-NEXT: .cfi_offset w29, -32 ; CHECK-NEXT: rdsvl x8, #1 @@ -258,14 +310,14 @@ define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za" ; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: cbz x8, .LBB4_2 -; CHECK-NEXT: // %bb.1: // %save.za +; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: bl __arm_tpidr2_save ; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: zero {za} ; CHECK-NEXT: .LBB4_2: ; CHECK-NEXT: smstart za -; CHECK-NEXT: sub x20, x29, #16 -; CHECK-NEXT: zero {za} -; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -277,67 +329,15 @@ define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za" ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: bl shared_za_callee ; CHECK-NEXT: bl preserves_za_callee -; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB4_6 -; CHECK-NEXT: // %bb.5: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB4_6: ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: smstop za ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: test_lazy_save_mixed_shared_and_private_callees: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 32 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w19, -16 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -24 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -32 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB4_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: zero {za} -; CHECK-NEWLOWERING-NEXT: .LBB4_2: -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB4_4 -; CHECK-NEWLOWERING-NEXT: // %bb.3: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB4_4: -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: bl shared_za_callee -; CHECK-NEWLOWERING-NEXT: bl preserves_za_callee -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret { call void @private_za_callee() call void @shared_za_callee() @@ -347,15 +347,98 @@ define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za" } define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { +; CHECK-SDAG-LABEL: test_many_back2back_private_za_calls: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w20, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: bl shared_za_callee +; CHECK-SDAG-NEXT: sub x20, x29, #16 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl private_za_callee +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB5_2 +; CHECK-SDAG-NEXT: // %bb.1: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB5_2: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl private_za_callee +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB5_4 +; CHECK-SDAG-NEXT: // %bb.3: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB5_4: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl private_za_callee +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB5_6 +; CHECK-SDAG-NEXT: // %bb.5: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB5_6: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl private_za_callee +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB5_8 +; CHECK-SDAG-NEXT: // %bb.7: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB5_8: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl private_za_callee +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB5_10 +; CHECK-SDAG-NEXT: // %bb.9: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB5_10: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl private_za_callee +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB5_12 +; CHECK-SDAG-NEXT: // %bb.11: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB5_12: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: bl shared_za_callee +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: test_many_back2back_private_za_calls: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa w29, 32 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w19, -16 ; CHECK-NEXT: .cfi_offset w30, -24 ; CHECK-NEXT: .cfi_offset w29, -32 ; CHECK-NEXT: rdsvl x8, #1 @@ -364,110 +447,27 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: bl shared_za_callee -; CHECK-NEXT: sub x20, x29, #16 -; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB5_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB5_2: -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB5_4 -; CHECK-NEXT: // %bb.3: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB5_4: -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB5_6 -; CHECK-NEXT: // %bb.5: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB5_6: -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB5_8 -; CHECK-NEXT: // %bb.7: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB5_8: -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB5_10 -; CHECK-NEXT: // %bb.9: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB5_10: -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB5_12 -; CHECK-NEXT: // %bb.11: +; CHECK-NEXT: cbnz x8, .LBB5_2 +; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB5_12: +; CHECK-NEXT: .LBB5_2: ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: bl shared_za_callee ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: test_many_back2back_private_za_calls: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 32 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w19, -16 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -24 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -32 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: bl shared_za_callee -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB5_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB5_2: -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: bl shared_za_callee -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret call void @shared_za_callee() call void @private_za_callee() call void @private_za_callee() @@ -572,19 +572,51 @@ declare i64 @many_args_private_za_callee( ; stack pointer before the call -- in this test the lazy save should be setup ; before the stack decrement. define i64 @test_many_callee_arguments( +; CHECK-SDAG-LABEL: test_many_callee_arguments: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: mov x8, sp +; CHECK-SDAG-NEXT: rdsvl x9, #1 +; CHECK-SDAG-NEXT: msub x8, x9, x9, x8 +; CHECK-SDAG-NEXT: mov sp, x8 +; CHECK-SDAG-NEXT: ldp x10, x11, [x29, #32] +; CHECK-SDAG-NEXT: sub x12, x29, #16 +; CHECK-SDAG-NEXT: stp x8, x9, [x29, #-16] +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x12 +; CHECK-SDAG-NEXT: stp x10, x11, [sp, #-16]! +; CHECK-SDAG-NEXT: bl many_args_private_za_callee +; CHECK-SDAG-NEXT: add sp, sp, #16 +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB9_2 +; CHECK-SDAG-NEXT: // %bb.1: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB9_2: +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: test_many_callee_arguments: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill ; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: msub x8, x9, x9, x8 -; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: ldp x10, x11, [x29, #32] ; CHECK-NEXT: sub x12, x29, #16 -; CHECK-NEXT: stp x8, x9, [x29, #-16] +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: msr TPIDR2_EL0, x12 ; CHECK-NEXT: stp x10, x11, [sp, #-16]! ; CHECK-NEXT: bl many_args_private_za_callee @@ -603,38 +635,6 @@ define i64 @test_many_callee_arguments( ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: test_many_callee_arguments: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: ldp x10, x11, [x29, #32] -; CHECK-NEWLOWERING-NEXT: sub x12, x29, #16 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x12 -; CHECK-NEWLOWERING-NEXT: stp x10, x11, [sp, #-16]! -; CHECK-NEWLOWERING-NEXT: bl many_args_private_za_callee -; CHECK-NEWLOWERING-NEXT: add sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: mov x1, x0 -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB9_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB9_2: -; CHECK-NEWLOWERING-NEXT: mov x0, x1 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9 ) nounwind "aarch64_inout_za" { %ret = call i64 @many_args_private_za_callee( diff --git a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll index 0717387ae2963..d2715b58439d8 100644 --- a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll +++ b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll @@ -1,51 +1,51 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs -aarch64-new-sme-abi=false < %s | FileCheck %s --check-prefix=CHECK-SDAG ; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-NEWLOWERING declare void @shared_za_callee() "aarch64_inout_za" define void @private_za() "aarch64_new_za" { +; CHECK-SDAG-LABEL: private_za: +; CHECK-SDAG: // %bb.0: // %prelude +; CHECK-SDAG-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -16 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: cbz x8, .LBB0_2 +; CHECK-SDAG-NEXT: b .LBB0_1 +; CHECK-SDAG-NEXT: .LBB0_1: // %save.za +; CHECK-SDAG-NEXT: bl __arm_tpidr2_save +; CHECK-SDAG-NEXT: mov x8, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 +; CHECK-SDAG-NEXT: b .LBB0_2 +; CHECK-SDAG-NEXT: .LBB0_2: +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: zero {za} +; CHECK-SDAG-NEXT: bl shared_za_callee +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: private_za: -; CHECK: // %bb.0: // %prelude +; CHECK: // %bb.0: ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: cbz x8, .LBB0_2 -; CHECK-NEXT: b .LBB0_1 -; CHECK-NEXT: .LBB0_1: // %save.za +; CHECK-NEXT: cbnz x8, .LBB0_1 +; CHECK-NEXT: b .LBB0_2 +; CHECK-NEXT: .LBB0_1: ; CHECK-NEXT: bl __arm_tpidr2_save -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: zero {za} ; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: smstart za -; CHECK-NEXT: zero {za} ; CHECK-NEXT: bl shared_za_callee ; CHECK-NEXT: smstop za ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: private_za: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -16 -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_1 -; CHECK-NEWLOWERING-NEXT: b .LBB0_2 -; CHECK-NEWLOWERING-NEXT: .LBB0_1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: zero {za} -; CHECK-NEWLOWERING-NEXT: b .LBB0_2 -; CHECK-NEWLOWERING-NEXT: .LBB0_2: -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: bl shared_za_callee -; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret call void @shared_za_callee() ret void } @@ -53,29 +53,65 @@ define void @private_za() "aarch64_new_za" { ; Note: This test must run at -O0 as otherwise the multiple exits are optimized out. ; TODO: We should be able to omit the ZA save here (as this function does not use ZA). define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_new_za" { +; CHECK-SDAG-LABEL: private_za_multiple_exit: +; CHECK-SDAG: // %bb.0: // %prelude +; CHECK-SDAG-NEXT: sub sp, sp, #32 +; CHECK-SDAG-NEXT: str x30, [sp, #16] // 8-byte Spill +; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SDAG-NEXT: .cfi_offset w30, -16 +; CHECK-SDAG-NEXT: str x2, [sp] // 8-byte Spill +; CHECK-SDAG-NEXT: str w1, [sp, #8] // 4-byte Spill +; CHECK-SDAG-NEXT: str w0, [sp, #12] // 4-byte Spill +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: cbz x8, .LBB1_2 +; CHECK-SDAG-NEXT: b .LBB1_1 +; CHECK-SDAG-NEXT: .LBB1_1: // %save.za +; CHECK-SDAG-NEXT: bl __arm_tpidr2_save +; CHECK-SDAG-NEXT: mov x8, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 +; CHECK-SDAG-NEXT: b .LBB1_2 +; CHECK-SDAG-NEXT: .LBB1_2: // %entry +; CHECK-SDAG-NEXT: ldr x8, [sp] // 8-byte Reload +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: zero {za} +; CHECK-SDAG-NEXT: subs x8, x8, #1 +; CHECK-SDAG-NEXT: b.ne .LBB1_4 +; CHECK-SDAG-NEXT: b .LBB1_3 +; CHECK-SDAG-NEXT: .LBB1_3: // %if.else +; CHECK-SDAG-NEXT: ldr w8, [sp, #12] // 4-byte Reload +; CHECK-SDAG-NEXT: ldr w9, [sp, #8] // 4-byte Reload +; CHECK-SDAG-NEXT: add w0, w8, w9 +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: ldr x30, [sp, #16] // 8-byte Reload +; CHECK-SDAG-NEXT: add sp, sp, #32 +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB1_4: // %if.end +; CHECK-SDAG-NEXT: ldr w8, [sp, #12] // 4-byte Reload +; CHECK-SDAG-NEXT: ldr w9, [sp, #8] // 4-byte Reload +; CHECK-SDAG-NEXT: subs w0, w8, w9 +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: ldr x30, [sp, #16] // 8-byte Reload +; CHECK-SDAG-NEXT: add sp, sp, #32 +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: private_za_multiple_exit: -; CHECK: // %bb.0: // %prelude -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Spill -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: str x2, [sp] // 8-byte Spill -; CHECK-NEXT: str w1, [sp, #8] // 4-byte Spill -; CHECK-NEXT: str w0, [sp, #12] // 4-byte Spill -; CHECK-NEXT: rdsvl x8, #1 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: cbz x8, .LBB1_2 -; CHECK-NEXT: b .LBB1_1 -; CHECK-NEXT: .LBB1_1: // %save.za +; CHECK-NEXT: cbnz x8, .LBB1_1 +; CHECK-NEXT: b .LBB1_2 +; CHECK-NEXT: .LBB1_1: // %entry ; CHECK-NEXT: bl __arm_tpidr2_save -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: zero {za} ; CHECK-NEXT: b .LBB1_2 ; CHECK-NEXT: .LBB1_2: // %entry -; CHECK-NEXT: ldr x8, [sp] // 8-byte Reload ; CHECK-NEXT: smstart za -; CHECK-NEXT: zero {za} -; CHECK-NEXT: subs x8, x8, #1 +; CHECK-NEXT: str w1, [sp, #8] // 4-byte Spill +; CHECK-NEXT: str w0, [sp, #12] // 4-byte Spill +; CHECK-NEXT: subs x8, x2, #1 ; CHECK-NEXT: b.ne .LBB1_4 ; CHECK-NEXT: b .LBB1_3 ; CHECK-NEXT: .LBB1_3: // %if.else @@ -83,51 +119,15 @@ define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_new_za" ; CHECK-NEXT: ldr w9, [sp, #8] // 4-byte Reload ; CHECK-NEXT: add w0, w8, w9 ; CHECK-NEXT: smstop za -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Reload -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB1_4: // %if.end ; CHECK-NEXT: ldr w8, [sp, #12] // 4-byte Reload ; CHECK-NEXT: ldr w9, [sp, #8] // 4-byte Reload ; CHECK-NEXT: subs w0, w8, w9 ; CHECK-NEXT: smstop za -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Reload -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: private_za_multiple_exit: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_1 -; CHECK-NEWLOWERING-NEXT: b .LBB1_2 -; CHECK-NEWLOWERING-NEXT: .LBB1_1: // %entry -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: zero {za} -; CHECK-NEWLOWERING-NEXT: b .LBB1_2 -; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %entry -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: str w1, [sp, #8] // 4-byte Spill -; CHECK-NEWLOWERING-NEXT: str w0, [sp, #12] // 4-byte Spill -; CHECK-NEWLOWERING-NEXT: subs x8, x2, #1 -; CHECK-NEWLOWERING-NEXT: b.ne .LBB1_4 -; CHECK-NEWLOWERING-NEXT: b .LBB1_3 -; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %if.else -; CHECK-NEWLOWERING-NEXT: ldr w8, [sp, #12] // 4-byte Reload -; CHECK-NEWLOWERING-NEXT: ldr w9, [sp, #8] // 4-byte Reload -; CHECK-NEWLOWERING-NEXT: add w0, w8, w9 -; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: add sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: ret -; CHECK-NEWLOWERING-NEXT: .LBB1_4: // %if.end -; CHECK-NEWLOWERING-NEXT: ldr w8, [sp, #12] // 4-byte Reload -; CHECK-NEWLOWERING-NEXT: ldr w9, [sp, #8] // 4-byte Reload -; CHECK-NEWLOWERING-NEXT: subs w0, w8, w9 -; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: add sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: ret entry: %tobool = icmp eq i64 %cond, 1 br i1 %tobool, label %if.else, label %if.end @@ -143,36 +143,36 @@ if.end: ; In simple cases like this we should omit all ZA setup. define i32 @private_za_trivially_does_not_use_za(i32 %x) "aarch64_new_za" { +; CHECK-SDAG-LABEL: private_za_trivially_does_not_use_za: +; CHECK-SDAG: // %bb.0: // %prelude +; CHECK-SDAG-NEXT: sub sp, sp, #32 +; CHECK-SDAG-NEXT: str x30, [sp, #16] // 8-byte Spill +; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SDAG-NEXT: .cfi_offset w30, -16 +; CHECK-SDAG-NEXT: str w0, [sp, #12] // 4-byte Spill +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: cbz x8, .LBB2_2 +; CHECK-SDAG-NEXT: b .LBB2_1 +; CHECK-SDAG-NEXT: .LBB2_1: // %save.za +; CHECK-SDAG-NEXT: bl __arm_tpidr2_save +; CHECK-SDAG-NEXT: mov x8, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 +; CHECK-SDAG-NEXT: b .LBB2_2 +; CHECK-SDAG-NEXT: .LBB2_2: +; CHECK-SDAG-NEXT: ldr w8, [sp, #12] // 4-byte Reload +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: zero {za} +; CHECK-SDAG-NEXT: add w0, w8, w8 +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: ldr x30, [sp, #16] // 8-byte Reload +; CHECK-SDAG-NEXT: add sp, sp, #32 +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: private_za_trivially_does_not_use_za: -; CHECK: // %bb.0: // %prelude -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Spill -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: str w0, [sp, #12] // 4-byte Spill -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: cbz x8, .LBB2_2 -; CHECK-NEXT: b .LBB2_1 -; CHECK-NEXT: .LBB2_1: // %save.za -; CHECK-NEXT: bl __arm_tpidr2_save -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEXT: b .LBB2_2 -; CHECK-NEXT: .LBB2_2: -; CHECK-NEXT: ldr w8, [sp, #12] // 4-byte Reload -; CHECK-NEXT: smstart za -; CHECK-NEXT: zero {za} -; CHECK-NEXT: add w0, w8, w8 -; CHECK-NEXT: smstop za -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Reload -; CHECK-NEXT: add sp, sp, #32 +; CHECK: // %bb.0: +; CHECK-NEXT: add w0, w0, w0 ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: private_za_trivially_does_not_use_za: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: add w0, w0, w0 -; CHECK-NEWLOWERING-NEXT: ret %ret = add i32 %x, %x ret i32 %ret } diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll index 240b204d15210..50449172ce85b 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll @@ -1,86 +1,86 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi=false < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING declare void @private_za_call() declare void @shared_za_call() "aarch64_inout_za" define void @private_za_loop(i32 %n) "aarch64_inout_za" nounwind { +; CHECK-SDAG-LABEL: private_za_loop: +; CHECK-SDAG: // %bb.0: // %entry +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: cmp w0, #1 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: b.lt .LBB0_5 +; CHECK-SDAG-NEXT: // %bb.1: // %loop.preheader +; CHECK-SDAG-NEXT: mov w19, w0 +; CHECK-SDAG-NEXT: sub x20, x29, #16 +; CHECK-SDAG-NEXT: b .LBB0_3 +; CHECK-SDAG-NEXT: .LBB0_2: // %loop +; CHECK-SDAG-NEXT: // in Loop: Header=BB0_3 Depth=1 +; CHECK-SDAG-NEXT: subs w19, w19, #1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: b.eq .LBB0_5 +; CHECK-SDAG-NEXT: .LBB0_3: // %loop +; CHECK-SDAG-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl private_za_call +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB0_2 +; CHECK-SDAG-NEXT: // %bb.4: // %loop +; CHECK-SDAG-NEXT: // in Loop: Header=BB0_3 Depth=1 +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: b .LBB0_2 +; CHECK-SDAG-NEXT: .LBB0_5: // %exit +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: private_za_loop: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: cmp w0, #1 ; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: b.lt .LBB0_5 +; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: b.lt .LBB0_3 ; CHECK-NEXT: // %bb.1: // %loop.preheader ; CHECK-NEXT: mov w19, w0 -; CHECK-NEXT: sub x20, x29, #16 -; CHECK-NEXT: b .LBB0_3 ; CHECK-NEXT: .LBB0_2: // %loop -; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: subs w19, w19, #1 -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: b.eq .LBB0_5 -; CHECK-NEXT: .LBB0_3: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: b.ne .LBB0_2 +; CHECK-NEXT: .LBB0_3: // %exit ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB0_2 -; CHECK-NEXT: // %bb.4: // %loop -; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: cbnz x8, .LBB0_5 +; CHECK-NEXT: // %bb.4: // %exit ; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_5: // %exit +; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: private_za_loop: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 -; CHECK-NEWLOWERING-NEXT: cmp w0, #1 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEWLOWERING-NEXT: b.lt .LBB0_3 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %loop.preheader -; CHECK-NEWLOWERING-NEXT: mov w19, w0 -; CHECK-NEWLOWERING-NEXT: .LBB0_2: // %loop -; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: subs w19, w19, #1 -; CHECK-NEWLOWERING-NEXT: b.ne .LBB0_2 -; CHECK-NEWLOWERING-NEXT: .LBB0_3: // %exit -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_5 -; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB0_5: // %exit -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret entry: %cmpgt = icmp sgt i32 %n, 0 br i1 %cmpgt, label %loop, label %exit @@ -98,6 +98,47 @@ exit: ; FIXME: In the new lowering we could weight edges to avoid doing the lazy save in the loop. define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind { +; CHECK-SDAG-LABEL: private_za_loop_active_entry_and_exit: +; CHECK-SDAG: // %bb.0: // %entry +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: mov w19, w0 +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: cmp w19, #1 +; CHECK-SDAG-NEXT: b.lt .LBB1_5 +; CHECK-SDAG-NEXT: // %bb.1: // %loop.preheader +; CHECK-SDAG-NEXT: sub x20, x29, #16 +; CHECK-SDAG-NEXT: b .LBB1_3 +; CHECK-SDAG-NEXT: .LBB1_2: // %loop +; CHECK-SDAG-NEXT: // in Loop: Header=BB1_3 Depth=1 +; CHECK-SDAG-NEXT: subs w19, w19, #1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: b.eq .LBB1_5 +; CHECK-SDAG-NEXT: .LBB1_3: // %loop +; CHECK-SDAG-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl private_za_call +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_2 +; CHECK-SDAG-NEXT: // %bb.4: // %loop +; CHECK-SDAG-NEXT: // in Loop: Header=BB1_3 Depth=1 +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: b .LBB1_2 +; CHECK-SDAG-NEXT: .LBB1_5: // %exit +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: b shared_za_call +; ; CHECK-LABEL: private_za_loop_active_entry_and_exit: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill @@ -106,9 +147,9 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mov w19, w0 ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov w19, w0 ; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: cmp w19, #1 @@ -118,13 +159,13 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no ; CHECK-NEXT: b .LBB1_3 ; CHECK-NEXT: .LBB1_2: // %loop ; CHECK-NEXT: // in Loop: Header=BB1_3 Depth=1 -; CHECK-NEXT: subs w19, w19, #1 ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: b.eq .LBB1_5 +; CHECK-NEXT: cbz w19, .LBB1_5 ; CHECK-NEXT: .LBB1_3: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: sub w19, w19, #1 ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 @@ -138,47 +179,6 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: b shared_za_call -; -; CHECK-NEWLOWERING-LABEL: private_za_loop_active_entry_and_exit: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: mov w19, w0 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: cmp w19, #1 -; CHECK-NEWLOWERING-NEXT: b.lt .LBB1_5 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %loop.preheader -; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16 -; CHECK-NEWLOWERING-NEXT: b .LBB1_3 -; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %loop -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: cbz w19, .LBB1_5 -; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %loop -; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20 -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: sub w19, w19, #1 -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2 -; CHECK-NEWLOWERING-NEXT: // %bb.4: // %loop -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1 -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: b .LBB1_2 -; CHECK-NEWLOWERING-NEXT: .LBB1_5: // %exit -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: b shared_za_call entry: %cmpgt = icmp sgt i32 %n, 0 tail call void @shared_za_call() @@ -268,6 +268,45 @@ exit: } define void @mixed_shared_private_za_loop(ptr %cond) "aarch64_inout_za" nounwind { +; CHECK-SDAG-LABEL: mixed_shared_private_za_loop: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: mov x19, x0 +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: sub x20, x29, #16 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: b .LBB4_2 +; CHECK-SDAG-NEXT: .LBB4_1: // %loop +; CHECK-SDAG-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: ldrb w8, [x19] +; CHECK-SDAG-NEXT: tbz w8, #0, .LBB4_4 +; CHECK-SDAG-NEXT: .LBB4_2: // %loop +; CHECK-SDAG-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl private_za_call +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB4_1 +; CHECK-SDAG-NEXT: // %bb.3: // %loop +; CHECK-SDAG-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: b .LBB4_1 +; CHECK-SDAG-NEXT: .LBB4_4: // %exit +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: mixed_shared_private_za_loop: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill @@ -276,26 +315,26 @@ define void @mixed_shared_private_za_loop(ptr %cond) "aarch64_inout_za" nounwind ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: sub x20, x29, #16 ; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: b .LBB4_2 ; CHECK-NEXT: .LBB4_1: // %loop ; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: ldrb w8, [x19] ; CHECK-NEXT: tbz w8, #0, .LBB4_4 ; CHECK-NEXT: .LBB4_2: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: ldrb w8, [x19] ; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: mrs x9, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB4_1 +; CHECK-NEXT: cbnz x9, .LBB4_1 ; CHECK-NEXT: // %bb.3: // %loop ; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 ; CHECK-NEXT: bl __arm_tpidr2_restore @@ -306,45 +345,6 @@ define void @mixed_shared_private_za_loop(ptr %cond) "aarch64_inout_za" nounwind ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: mixed_shared_private_za_loop: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: mov x19, x0 -; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: b .LBB4_2 -; CHECK-NEWLOWERING-NEXT: .LBB4_1: // %loop -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB4_2 Depth=1 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: tbz w8, #0, .LBB4_4 -; CHECK-NEWLOWERING-NEXT: .LBB4_2: // %loop -; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20 -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: ldrb w8, [x19] -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x9, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x9, .LBB4_1 -; CHECK-NEWLOWERING-NEXT: // %bb.3: // %loop -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB4_2 Depth=1 -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: b .LBB4_1 -; CHECK-NEWLOWERING-NEXT: .LBB4_4: // %exit -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret br label %loop loop: @@ -364,6 +364,49 @@ exit: define void @cond_clobber_followed_by_clobber(i1 %cond) "aarch64_inout_za" nounwind { +; CHECK-SDAG-LABEL: cond_clobber_followed_by_clobber: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: mov w19, w0 +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: tbz w19, #0, .LBB5_4 +; CHECK-SDAG-NEXT: // %bb.1: // %cond_clobber +; CHECK-SDAG-NEXT: sub x8, x29, #16 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 +; CHECK-SDAG-NEXT: bl private_za_call +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB5_3 +; CHECK-SDAG-NEXT: // %bb.2: // %cond_clobber +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB5_3: // %cond_clobber +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .LBB5_4: // %exit +; CHECK-SDAG-NEXT: sub x8, x29, #16 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 +; CHECK-SDAG-NEXT: bl private_za_call +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB5_6 +; CHECK-SDAG-NEXT: // %bb.5: // %exit +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB5_6: // %exit +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: b shared_za_call +; ; CHECK-LABEL: cond_clobber_followed_by_clobber: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill @@ -372,73 +415,30 @@ define void @cond_clobber_followed_by_clobber(i1 %cond) "aarch64_inout_za" nounw ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mov w19, w0 ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov w19, w0 ; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: tbz w19, #0, .LBB5_4 -; CHECK-NEXT: // %bb.1: // %cond_clobber ; CHECK-NEXT: sub x8, x29, #16 ; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: tbz w19, #0, .LBB5_2 +; CHECK-NEXT: // %bb.1: // %cond_clobber ; CHECK-NEXT: bl private_za_call -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB5_3 -; CHECK-NEXT: // %bb.2: // %cond_clobber -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB5_3: // %cond_clobber -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: .LBB5_4: // %exit -; CHECK-NEXT: sub x8, x29, #16 -; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: .LBB5_2: // %exit ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB5_6 -; CHECK-NEXT: // %bb.5: // %exit +; CHECK-NEXT: cbnz x8, .LBB5_4 +; CHECK-NEXT: // %bb.3: // %exit ; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB5_6: // %exit +; CHECK-NEXT: .LBB5_4: // %exit ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: b shared_za_call -; -; CHECK-NEWLOWERING-LABEL: cond_clobber_followed_by_clobber: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: mov w19, w0 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: tbz w19, #0, .LBB5_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %cond_clobber -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: .LBB5_2: // %exit -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB5_4 -; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB5_4: // %exit -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: b shared_za_call tail call void @shared_za_call() br i1 %cond, label %cond_clobber, label %exit @@ -543,6 +543,48 @@ merge_shared: define void @diamond_mixed_za_merge_private(i1 %cond) "aarch64_inout_za" nounwind { +; CHECK-SDAG-LABEL: diamond_mixed_za_merge_private: +; CHECK-SDAG: // %bb.0: // %entry +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: tbz w0, #0, .LBB8_2 +; CHECK-SDAG-NEXT: // %bb.1: // %then +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: b .LBB8_5 +; CHECK-SDAG-NEXT: .LBB8_2: // %else +; CHECK-SDAG-NEXT: sub x8, x29, #16 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 +; CHECK-SDAG-NEXT: bl private_za_call +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB8_4 +; CHECK-SDAG-NEXT: // %bb.3: // %else +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB8_4: // %else +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .LBB8_5: // %merge_private_za +; CHECK-SDAG-NEXT: sub x8, x29, #16 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 +; CHECK-SDAG-NEXT: bl private_za_call +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB8_7 +; CHECK-SDAG-NEXT: // %bb.6: // %merge_private_za +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB8_7: // %merge_private_za +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: diamond_mixed_za_merge_private: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -556,68 +598,26 @@ define void @diamond_mixed_za_merge_private(i1 %cond) "aarch64_inout_za" nounwin ; CHECK-NEXT: tbz w0, #0, .LBB8_2 ; CHECK-NEXT: // %bb.1: // %then ; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: b .LBB8_5 -; CHECK-NEXT: .LBB8_2: // %else ; CHECK-NEXT: sub x8, x29, #16 ; CHECK-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEXT: bl private_za_call -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB8_4 -; CHECK-NEXT: // %bb.3: // %else -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB8_4: // %else -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: .LBB8_5: // %merge_private_za +; CHECK-NEXT: b .LBB8_3 +; CHECK-NEXT: .LBB8_2: // %else ; CHECK-NEXT: sub x8, x29, #16 ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: .LBB8_3: // %merge_private_za +; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB8_7 -; CHECK-NEXT: // %bb.6: // %merge_private_za +; CHECK-NEXT: cbnz x8, .LBB8_5 +; CHECK-NEXT: // %bb.4: // %merge_private_za ; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB8_7: // %merge_private_za +; CHECK-NEXT: .LBB8_5: // %merge_private_za ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: diamond_mixed_za_merge_private: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB8_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %then -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: b .LBB8_3 -; CHECK-NEWLOWERING-NEXT: .LBB8_2: // %else -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: .LBB8_3: // %merge_private_za -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB8_5 -; CHECK-NEWLOWERING-NEXT: // %bb.4: // %merge_private_za -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB8_5: // %merge_private_za -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret entry: br i1 %cond, label %then, label %else @@ -635,6 +635,56 @@ merge_private_za: } define void @critical_edge_mixed_za(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind { +; CHECK-SDAG-LABEL: critical_edge_mixed_za: +; CHECK-SDAG: // %bb.0: // %entry +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: mov w19, w1 +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: tbz w0, #0, .LBB9_5 +; CHECK-SDAG-NEXT: // %bb.1: // %shared_path +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: tbz w19, #0, .LBB9_8 +; CHECK-SDAG-NEXT: .LBB9_2: // %exit_private +; CHECK-SDAG-NEXT: sub x8, x29, #16 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 +; CHECK-SDAG-NEXT: bl private_za_call +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB9_4 +; CHECK-SDAG-NEXT: // %bb.3: // %exit_private +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB9_4: // %exit_private +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: b .LBB9_9 +; CHECK-SDAG-NEXT: .LBB9_5: // %private_path +; CHECK-SDAG-NEXT: sub x8, x29, #16 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 +; CHECK-SDAG-NEXT: bl private_za_call +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB9_7 +; CHECK-SDAG-NEXT: // %bb.6: // %private_path +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB9_7: // %private_path +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: tbnz w19, #0, .LBB9_2 +; CHECK-SDAG-NEXT: .LBB9_8: // %exit_shared +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: .LBB9_9: // %common.ret +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: critical_edge_mixed_za: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill @@ -643,9 +693,9 @@ define void @critical_edge_mixed_za(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mov w19, w1 ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov w19, w1 ; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: tbz w0, #0, .LBB9_5 ; CHECK-NEXT: // %bb.1: // %shared_path @@ -684,56 +734,6 @@ define void @critical_edge_mixed_za(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: critical_edge_mixed_za: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: mov w19, w1 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB9_5 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %shared_path -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: tbz w19, #0, .LBB9_8 -; CHECK-NEWLOWERING-NEXT: .LBB9_2: // %exit_private -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB9_4 -; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit_private -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB9_4: // %exit_private -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: b .LBB9_9 -; CHECK-NEWLOWERING-NEXT: .LBB9_5: // %private_path -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB9_7 -; CHECK-NEWLOWERING-NEXT: // %bb.6: // %private_path -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB9_7: // %private_path -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: tbnz w19, #0, .LBB9_2 -; CHECK-NEWLOWERING-NEXT: .LBB9_8: // %exit_shared -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: .LBB9_9: // %common.ret -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret entry: br i1 %c1, label %shared_path, label %private_path @@ -836,6 +836,46 @@ exit: } define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind { +; CHECK-SDAG-LABEL: loop_with_external_entry: +; CHECK-SDAG: // %bb.0: // %entry +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: mov w19, w1 +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: tbz w0, #0, .LBB11_2 +; CHECK-SDAG-NEXT: // %bb.1: // %init +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: .LBB11_2: // %loop.preheader +; CHECK-SDAG-NEXT: sub x20, x29, #16 +; CHECK-SDAG-NEXT: b .LBB11_4 +; CHECK-SDAG-NEXT: .LBB11_3: // %loop +; CHECK-SDAG-NEXT: // in Loop: Header=BB11_4 Depth=1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: tbz w19, #0, .LBB11_6 +; CHECK-SDAG-NEXT: .LBB11_4: // %loop +; CHECK-SDAG-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl private_za_call +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB11_3 +; CHECK-SDAG-NEXT: // %bb.5: // %loop +; CHECK-SDAG-NEXT: // in Loop: Header=BB11_4 Depth=1 +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: b .LBB11_3 +; CHECK-SDAG-NEXT: .LBB11_6: // %exit +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: loop_with_external_entry: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill @@ -844,9 +884,9 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mov w19, w1 ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov w19, w1 ; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: tbz w0, #0, .LBB11_2 ; CHECK-NEXT: // %bb.1: // %init @@ -875,46 +915,6 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: loop_with_external_entry: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: mov w19, w1 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB11_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %init -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: .LBB11_2: // %loop.preheader -; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16 -; CHECK-NEWLOWERING-NEXT: b .LBB11_4 -; CHECK-NEWLOWERING-NEXT: .LBB11_3: // %loop -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB11_4 Depth=1 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: tbz w19, #0, .LBB11_6 -; CHECK-NEWLOWERING-NEXT: .LBB11_4: // %loop -; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20 -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB11_3 -; CHECK-NEWLOWERING-NEXT: // %bb.5: // %loop -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB11_4 Depth=1 -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: b .LBB11_3 -; CHECK-NEWLOWERING-NEXT: .LBB11_6: // %exit -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret entry: br i1 %c1, label %init, label %loop diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll index 3947127c47844..5243b8d7203d8 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-SDAG +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi=false -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-SDAG ; A simple EH test case that corresponds to the following C++ source: ; diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll index afd56d198d0d3..d4840f77c5392 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi=false < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING define i32 @no_tpidr2_save_required() "aarch64_inout_za" { ; CHECK-COMMON-LABEL: no_tpidr2_save_required: @@ -64,6 +64,51 @@ exit: } define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" { +; CHECK-SDAG-LABEL: multi_bb_stpidr2_save_required_stackprobe: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: str xzr, [sp, #-16]! +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -8 +; CHECK-SDAG-NEXT: .cfi_offset w29, -16 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 +; CHECK-SDAG-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-SDAG-NEXT: cmp sp, x9 +; CHECK-SDAG-NEXT: b.le .LBB2_3 +; CHECK-SDAG-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 +; CHECK-SDAG-NEXT: str xzr, [sp] +; CHECK-SDAG-NEXT: b .LBB2_1 +; CHECK-SDAG-NEXT: .LBB2_3: +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: ldr xzr, [sp] +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: cbz w0, .LBB2_5 +; CHECK-SDAG-NEXT: // %bb.4: // %use_b +; CHECK-SDAG-NEXT: fmov s1, #4.00000000 +; CHECK-SDAG-NEXT: fadd s0, s0, s1 +; CHECK-SDAG-NEXT: b .LBB2_8 +; CHECK-SDAG-NEXT: .LBB2_5: // %use_c +; CHECK-SDAG-NEXT: fmov s0, s1 +; CHECK-SDAG-NEXT: sub x8, x29, #16 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 +; CHECK-SDAG-NEXT: bl cosf +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB2_7 +; CHECK-SDAG-NEXT: // %bb.6: // %use_c +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB2_7: // %use_c +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .LBB2_8: // %exit +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -74,7 +119,9 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ; CHECK-NEXT: cmp sp, x9 @@ -90,69 +137,22 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float ; CHECK-NEXT: // %bb.4: // %use_b ; CHECK-NEXT: fmov s1, #4.00000000 ; CHECK-NEXT: fadd s0, s0, s1 -; CHECK-NEXT: b .LBB2_8 +; CHECK-NEXT: b .LBB2_6 ; CHECK-NEXT: .LBB2_5: // %use_c ; CHECK-NEXT: fmov s0, s1 -; CHECK-NEXT: sub x8, x29, #16 -; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl cosf +; CHECK-NEXT: .LBB2_6: // %exit ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB2_7 -; CHECK-NEXT: // %bb.6: // %use_c +; CHECK-NEXT: cbnz x8, .LBB2_8 +; CHECK-NEXT: // %bb.7: // %exit ; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB2_7: // %use_c -; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: .LBB2_8: // %exit +; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: multi_bb_stpidr2_save_required_stackprobe: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: str xzr, [sp, #-16]! -; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 16 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -8 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEWLOWERING-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536 -; CHECK-NEWLOWERING-NEXT: cmp sp, x9 -; CHECK-NEWLOWERING-NEXT: b.le .LBB2_3 -; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 -; CHECK-NEWLOWERING-NEXT: str xzr, [sp] -; CHECK-NEWLOWERING-NEXT: b .LBB2_1 -; CHECK-NEWLOWERING-NEXT: .LBB2_3: -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp] -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_5 -; CHECK-NEWLOWERING-NEXT: // %bb.4: // %use_b -; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000 -; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1 -; CHECK-NEWLOWERING-NEXT: b .LBB2_6 -; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %use_c -; CHECK-NEWLOWERING-NEXT: fmov s0, s1 -; CHECK-NEWLOWERING-NEXT: bl cosf -; CHECK-NEWLOWERING-NEXT: .LBB2_6: // %exit -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_8 -; CHECK-NEWLOWERING-NEXT: // %bb.7: // %exit -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB2_8: // %exit -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret %cmp = icmp ne i32 %a, 0 br i1 %cmp, label %use_b, label %use_c diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 0d4a39b2eeb2f..24b4565cf24b5 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -start-after=simplifycfg -enable-tail-merge=false -aarch64-new-sme-abi=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -start-after=simplifycfg -enable-tail-merge=false -aarch64-new-sme-abi -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING ; ; Private-ZA Callee @@ -30,6 +30,36 @@ define void @zt0_in_caller_no_state_callee(ptr %callee) "aarch64_in_zt0" nounwin ; Expect setup and restore lazy-save around call ; Expect smstart za after call define void @za_zt0_shared_caller_no_state_callee(ptr %callee) "aarch64_inout_za" "aarch64_in_zt0" nounwind { +; CHECK-SDAG-LABEL: za_zt0_shared_caller_no_state_callee: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #80 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: sub x10, x29, #16 +; CHECK-SDAG-NEXT: sub x19, x29, #80 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x10 +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: blr x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_2 +; CHECK-SDAG-NEXT: // %bb.1: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_2: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: za_zt0_shared_caller_no_state_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill @@ -40,55 +70,25 @@ define void @za_zt0_shared_caller_no_state_callee(ptr %callee) "aarch64_inout_za ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: sub x19, x29, #80 -; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: sub x19, x29, #64 +; CHECK-NEXT: sub x10, x29, #80 +; CHECK-NEXT: stp x9, x8, [x29, #-80] ; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: blr x0 ; CHECK-NEXT: smstart za -; CHECK-NEXT: ldr zt0, [x19] ; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x0, x29, #80 ; CHECK-NEXT: cbnz x8, .LBB1_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: ldr zt0, [x19] ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: za_zt0_shared_caller_no_state_callee: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: sub x19, x29, #64 -; CHECK-NEWLOWERING-NEXT: sub x10, x29, #80 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-80] -; CHECK-NEWLOWERING-NEXT: str zt0, [x19] -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEWLOWERING-NEXT: blr x0 -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #80 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB1_2: -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: ldr zt0, [x19] -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret call void %callee(); ret void; } @@ -167,48 +167,48 @@ define void @zt0_in_caller_zt0_new_callee(ptr %callee) "aarch64_in_zt0" nounwind ; Expect spill & fill of ZT0 around call ; Before return, expect smstop ZA define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwind { +; CHECK-SDAG-LABEL: zt0_new_caller_zt0_new_callee: +; CHECK-SDAG: // %bb.0: // %prelude +; CHECK-SDAG-NEXT: sub sp, sp, #80 +; CHECK-SDAG-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: cbz x8, .LBB6_2 +; CHECK-SDAG-NEXT: // %bb.1: // %save.za +; CHECK-SDAG-NEXT: bl __arm_tpidr2_save +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .LBB6_2: +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: zero { zt0 } +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: blr x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: add sp, sp, #80 +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: zt0_new_caller_zt0_new_callee: -; CHECK: // %bb.0: // %prelude +; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Spill ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: cbz x8, .LBB6_2 -; CHECK-NEXT: // %bb.1: // %save.za +; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: bl __arm_tpidr2_save ; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: zero { zt0 } ; CHECK-NEXT: .LBB6_2: ; CHECK-NEXT: smstart za -; CHECK-NEXT: zero { zt0 } -; CHECK-NEXT: mov x19, sp -; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str zt0, [x8] ; CHECK-NEXT: smstop za ; CHECK-NEXT: blr x0 -; CHECK-NEXT: smstart za -; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: smstop za -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Reload ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: zt0_new_caller_zt0_new_callee: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80 -; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB6_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: zero { zt0 } -; CHECK-NEWLOWERING-NEXT: .LBB6_2: -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mov x8, sp -; CHECK-NEWLOWERING-NEXT: str zt0, [x8] -; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: blr x0 -; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload -; CHECK-NEWLOWERING-NEXT: add sp, sp, #80 -; CHECK-NEWLOWERING-NEXT: ret call void %callee() "aarch64_new_zt0"; ret void; } @@ -219,46 +219,46 @@ define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwi ; Expect spill & fill of ZT0 around __arm_sme_state call ; Before return, expect smstop ZA define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind { +; CHECK-SDAG-LABEL: zt0_new_caller_abi_routine_callee: +; CHECK-SDAG: // %bb.0: // %prelude +; CHECK-SDAG-NEXT: sub sp, sp, #80 +; CHECK-SDAG-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: cbz x8, .LBB7_2 +; CHECK-SDAG-NEXT: // %bb.1: // %save.za +; CHECK-SDAG-NEXT: bl __arm_tpidr2_save +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .LBB7_2: +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: zero { zt0 } +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: bl __arm_sme_state +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: add sp, sp, #80 +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: zt0_new_caller_abi_routine_callee: -; CHECK: // %bb.0: // %prelude +; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Spill ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: cbz x8, .LBB7_2 -; CHECK-NEXT: // %bb.1: // %save.za +; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: bl __arm_tpidr2_save ; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: zero { zt0 } ; CHECK-NEXT: .LBB7_2: ; CHECK-NEXT: smstart za -; CHECK-NEXT: zero { zt0 } -; CHECK-NEXT: mov x19, sp -; CHECK-NEXT: str zt0, [x19] -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str zt0, [x8] ; CHECK-NEXT: smstop za -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Reload ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: zt0_new_caller_abi_routine_callee: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80 -; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB7_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: zero { zt0 } -; CHECK-NEWLOWERING-NEXT: .LBB7_2: -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mov x8, sp -; CHECK-NEWLOWERING-NEXT: str zt0, [x8] -; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state -; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload -; CHECK-NEWLOWERING-NEXT: add sp, sp, #80 -; CHECK-NEWLOWERING-NEXT: ret %res = call {i64, i64} @__arm_sme_state() %res.0 = extractvalue {i64, i64} %res, 0 ret i64 %res.0 @@ -274,37 +274,37 @@ declare {i64, i64} @__arm_sme_state() ; Expect smstart ZA & clear ZT0 ; Before return, expect smstop ZA define void @zt0_new_caller(ptr %callee) "aarch64_new_zt0" nounwind { +; CHECK-SDAG-LABEL: zt0_new_caller: +; CHECK-SDAG: // %bb.0: // %prelude +; CHECK-SDAG-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: cbz x8, .LBB8_2 +; CHECK-SDAG-NEXT: // %bb.1: // %save.za +; CHECK-SDAG-NEXT: bl __arm_tpidr2_save +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .LBB8_2: +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: zero { zt0 } +; CHECK-SDAG-NEXT: blr x0 +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: zt0_new_caller: -; CHECK: // %bb.0: // %prelude +; CHECK: // %bb.0: ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: cbz x8, .LBB8_2 -; CHECK-NEXT: // %bb.1: // %save.za +; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: bl __arm_tpidr2_save ; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: zero { zt0 } ; CHECK-NEXT: .LBB8_2: ; CHECK-NEXT: smstart za -; CHECK-NEXT: zero { zt0 } ; CHECK-NEXT: blr x0 ; CHECK-NEXT: smstop za ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: zt0_new_caller: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB8_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: zero { zt0 } -; CHECK-NEWLOWERING-NEXT: .LBB8_2: -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: blr x0 -; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret call void %callee() "aarch64_in_zt0"; ret void; } @@ -313,39 +313,39 @@ define void @zt0_new_caller(ptr %callee) "aarch64_new_zt0" nounwind { ; Expect smstart ZA, clear ZA & clear ZT0 ; Before return, expect smstop ZA define void @new_za_zt0_caller(ptr %callee) "aarch64_new_za" "aarch64_new_zt0" nounwind { +; CHECK-SDAG-LABEL: new_za_zt0_caller: +; CHECK-SDAG: // %bb.0: // %prelude +; CHECK-SDAG-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: cbz x8, .LBB9_2 +; CHECK-SDAG-NEXT: // %bb.1: // %save.za +; CHECK-SDAG-NEXT: bl __arm_tpidr2_save +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .LBB9_2: +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: zero {za} +; CHECK-SDAG-NEXT: zero { zt0 } +; CHECK-SDAG-NEXT: blr x0 +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: new_za_zt0_caller: -; CHECK: // %bb.0: // %prelude +; CHECK: // %bb.0: ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: cbz x8, .LBB9_2 -; CHECK-NEXT: // %bb.1: // %save.za +; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: bl __arm_tpidr2_save ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: .LBB9_2: -; CHECK-NEXT: smstart za ; CHECK-NEXT: zero {za} ; CHECK-NEXT: zero { zt0 } +; CHECK-NEXT: .LBB9_2: +; CHECK-NEXT: smstart za ; CHECK-NEXT: blr x0 ; CHECK-NEXT: smstop za ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: new_za_zt0_caller: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB9_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: zero {za} -; CHECK-NEWLOWERING-NEXT: zero { zt0 } -; CHECK-NEWLOWERING-NEXT: .LBB9_2: -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: blr x0 -; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret call void %callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; } @@ -378,6 +378,38 @@ define void @shared_za_new_zt0(ptr %callee) "aarch64_inout_za" "aarch64_new_zt0" define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwind { +; CHECK-SDAG-LABEL: zt0_multiple_private_za_calls: +; CHECK-SDAG: // %bb.0: +; CHECK-SDAG-NEXT: sub sp, sp, #96 +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x20, sp +; CHECK-SDAG-NEXT: mov x19, x0 +; CHECK-SDAG-NEXT: str x30, [sp, #64] // 8-byte Spill +; CHECK-SDAG-NEXT: str zt0, [x20] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: blr x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x20] +; CHECK-SDAG-NEXT: str zt0, [x20] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: blr x19 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x20] +; CHECK-SDAG-NEXT: str zt0, [x20] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: blr x19 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x20] +; CHECK-SDAG-NEXT: str zt0, [x20] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: blr x19 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x20] +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldr x30, [sp, #64] // 8-byte Reload +; CHECK-SDAG-NEXT: add sp, sp, #96 +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: zt0_multiple_private_za_calls: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #96 @@ -388,20 +420,8 @@ define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwin ; CHECK-NEXT: str zt0, [x20] ; CHECK-NEXT: smstop za ; CHECK-NEXT: blr x0 -; CHECK-NEXT: smstart za -; CHECK-NEXT: ldr zt0, [x20] -; CHECK-NEXT: str zt0, [x20] -; CHECK-NEXT: smstop za ; CHECK-NEXT: blr x19 -; CHECK-NEXT: smstart za -; CHECK-NEXT: ldr zt0, [x20] -; CHECK-NEXT: str zt0, [x20] -; CHECK-NEXT: smstop za ; CHECK-NEXT: blr x19 -; CHECK-NEXT: smstart za -; CHECK-NEXT: ldr zt0, [x20] -; CHECK-NEXT: str zt0, [x20] -; CHECK-NEXT: smstop za ; CHECK-NEXT: blr x19 ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x20] @@ -409,26 +429,6 @@ define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwin ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: zt0_multiple_private_za_calls: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #96 -; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x20, sp -; CHECK-NEWLOWERING-NEXT: mov x19, x0 -; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: str zt0, [x20] -; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: blr x0 -; CHECK-NEWLOWERING-NEXT: blr x19 -; CHECK-NEWLOWERING-NEXT: blr x19 -; CHECK-NEWLOWERING-NEXT: blr x19 -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: ldr zt0, [x20] -; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload -; CHECK-NEWLOWERING-NEXT: add sp, sp, #96 -; CHECK-NEWLOWERING-NEXT: ret call void %callee() call void %callee() call void %callee() diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index c1a42b568673a..000523de203fc 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -3238,9 +3238,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK0-NEXT: sub sp, sp, #16 ; CHECK0-NEXT: rdsvl x8, #1 ; CHECK0-NEXT: mov x9, sp -; CHECK0-NEXT: mov w20, w0 ; CHECK0-NEXT: msub x9, x8, x8, x9 ; CHECK0-NEXT: mov sp, x9 +; CHECK0-NEXT: mov w20, w0 ; CHECK0-NEXT: sub x10, x29, #80 ; CHECK0-NEXT: stp x9, x8, [x29, #-80] ; CHECK0-NEXT: msr TPIDR2_EL0, x10 @@ -3309,10 +3309,10 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK64-NEXT: sub sp, sp, #80 ; CHECK64-NEXT: rdsvl x8, #1 ; CHECK64-NEXT: mov x9, sp -; CHECK64-NEXT: mov w20, w0 -; CHECK64-NEXT: msub x9, x8, x8, x9 ; CHECK64-NEXT: mov x19, sp +; CHECK64-NEXT: msub x9, x8, x8, x9 ; CHECK64-NEXT: mov sp, x9 +; CHECK64-NEXT: mov w20, w0 ; CHECK64-NEXT: add x10, x19, #0 ; CHECK64-NEXT: stp x9, x8, [x19] ; CHECK64-NEXT: msr TPIDR2_EL0, x10 @@ -3387,10 +3387,10 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK1024-NEXT: sub sp, sp, #1040 ; CHECK1024-NEXT: rdsvl x8, #1 ; CHECK1024-NEXT: mov x9, sp -; CHECK1024-NEXT: mov w20, w0 -; CHECK1024-NEXT: msub x9, x8, x8, x9 ; CHECK1024-NEXT: mov x19, sp +; CHECK1024-NEXT: msub x9, x8, x8, x9 ; CHECK1024-NEXT: mov sp, x9 +; CHECK1024-NEXT: mov w20, w0 ; CHECK1024-NEXT: add x10, x19, #0 ; CHECK1024-NEXT: stp x9, x8, [x19] ; CHECK1024-NEXT: msr TPIDR2_EL0, x10 diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll index 37adfb89e4762..4dec5471e689c 100644 --- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-streaming-hazard-size=0 -aarch64-new-sme-abi=false | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-streaming-hazard-size=0 | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK -; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-streaming-hazard-size=0 -aarch64-new-sme-abi | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-streaming-hazard-size=0 -pass-remarks-analysis=stack-frame-layout 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK-FRAMELAYOUT ; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64 @@ -524,6 +524,77 @@ declare ptr @memset(ptr, i32, i32) ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-128], Type: VariableSized, Align: 16, Size: 0 define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "target-features"="+sme" { +; CHECK-SDAG-LABEL: vastate: +; CHECK-SDAG: // %bb.0: // %entry +; CHECK-SDAG-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 112 +; CHECK-SDAG-NEXT: cntd x9 +; CHECK-SDAG-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x9, [sp, #80] // 8-byte Spill +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: add x29, sp, #64 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 48 +; CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w20, -16 +; CHECK-SDAG-NEXT: .cfi_offset vg, -32 +; CHECK-SDAG-NEXT: .cfi_offset w30, -40 +; CHECK-SDAG-NEXT: .cfi_offset w29, -48 +; CHECK-SDAG-NEXT: .cfi_offset b8, -56 +; CHECK-SDAG-NEXT: .cfi_offset b9, -64 +; CHECK-SDAG-NEXT: .cfi_offset b10, -72 +; CHECK-SDAG-NEXT: .cfi_offset b11, -80 +; CHECK-SDAG-NEXT: .cfi_offset b12, -88 +; CHECK-SDAG-NEXT: .cfi_offset b13, -96 +; CHECK-SDAG-NEXT: .cfi_offset b14, -104 +; CHECK-SDAG-NEXT: .cfi_offset b15, -112 +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: mov w20, w0 +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: sub x10, x29, #80 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-80] +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x10 +; CHECK-SDAG-NEXT: smstop sm +; CHECK-SDAG-NEXT: bl other +; CHECK-SDAG-NEXT: smstart sm +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #80 +; CHECK-SDAG-NEXT: cbnz x8, .LBB8_2 +; CHECK-SDAG-NEXT: // %bb.1: // %entry +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB8_2: // %entry +; CHECK-SDAG-NEXT: mov w0, w20 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: sub sp, x29, #64 +; CHECK-SDAG-NEXT: .cfi_def_cfa wsp, 112 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 0 +; CHECK-SDAG-NEXT: .cfi_restore w19 +; CHECK-SDAG-NEXT: .cfi_restore w20 +; CHECK-SDAG-NEXT: .cfi_restore vg +; CHECK-SDAG-NEXT: .cfi_restore w30 +; CHECK-SDAG-NEXT: .cfi_restore w29 +; CHECK-SDAG-NEXT: .cfi_restore b8 +; CHECK-SDAG-NEXT: .cfi_restore b9 +; CHECK-SDAG-NEXT: .cfi_restore b10 +; CHECK-SDAG-NEXT: .cfi_restore b11 +; CHECK-SDAG-NEXT: .cfi_restore b12 +; CHECK-SDAG-NEXT: .cfi_restore b13 +; CHECK-SDAG-NEXT: .cfi_restore b14 +; CHECK-SDAG-NEXT: .cfi_restore b15 +; CHECK-SDAG-NEXT: ret +; ; CHECK-LABEL: vastate: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill @@ -553,9 +624,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mov w20, w0 ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov w20, w0 ; CHECK-NEXT: sub x10, x29, #80 ; CHECK-NEXT: stp x9, x8, [x29, #-80] ; CHECK-NEXT: msr TPIDR2_EL0, x10 @@ -594,77 +665,6 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK-NEXT: .cfi_restore b14 ; CHECK-NEXT: .cfi_restore b15 ; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: vastate: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa_offset 112 -; CHECK-NEWLOWERING-NEXT: cntd x9 -; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x9, [sp, #80] // 8-byte Spill -; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: add x29, sp, #64 -; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 48 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w19, -8 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w20, -16 -; CHECK-NEWLOWERING-NEXT: .cfi_offset vg, -32 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -40 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -48 -; CHECK-NEWLOWERING-NEXT: .cfi_offset b8, -56 -; CHECK-NEWLOWERING-NEXT: .cfi_offset b9, -64 -; CHECK-NEWLOWERING-NEXT: .cfi_offset b10, -72 -; CHECK-NEWLOWERING-NEXT: .cfi_offset b11, -80 -; CHECK-NEWLOWERING-NEXT: .cfi_offset b12, -88 -; CHECK-NEWLOWERING-NEXT: .cfi_offset b13, -96 -; CHECK-NEWLOWERING-NEXT: .cfi_offset b14, -104 -; CHECK-NEWLOWERING-NEXT: .cfi_offset b15, -112 -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: mov w20, w0 -; CHECK-NEWLOWERING-NEXT: sub x10, x29, #80 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-80] -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEWLOWERING-NEXT: smstop sm -; CHECK-NEWLOWERING-NEXT: bl other -; CHECK-NEWLOWERING-NEXT: smstart sm -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #80 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB8_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %entry -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB8_2: // %entry -; CHECK-NEWLOWERING-NEXT: mov w0, w20 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64 -; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa wsp, 112 -; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEWLOWERING-NEXT: .cfi_restore w19 -; CHECK-NEWLOWERING-NEXT: .cfi_restore w20 -; CHECK-NEWLOWERING-NEXT: .cfi_restore vg -; CHECK-NEWLOWERING-NEXT: .cfi_restore w30 -; CHECK-NEWLOWERING-NEXT: .cfi_restore w29 -; CHECK-NEWLOWERING-NEXT: .cfi_restore b8 -; CHECK-NEWLOWERING-NEXT: .cfi_restore b9 -; CHECK-NEWLOWERING-NEXT: .cfi_restore b10 -; CHECK-NEWLOWERING-NEXT: .cfi_restore b11 -; CHECK-NEWLOWERING-NEXT: .cfi_restore b12 -; CHECK-NEWLOWERING-NEXT: .cfi_restore b13 -; CHECK-NEWLOWERING-NEXT: .cfi_restore b14 -; CHECK-NEWLOWERING-NEXT: .cfi_restore b15 -; CHECK-NEWLOWERING-NEXT: ret entry: tail call void @other() ret i32 %x