diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index b96f6f12a58d6..b3e1ddbb91f79 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -632,8 +632,8 @@ MachineSMEABI::findStateChangeInsertionPoint(
     PhysLiveRegs = Block.PhysLiveRegsAtExit;
   }

-  if (!(PhysLiveRegs & LiveRegs::NZCV))
-    return {InsertPt, PhysLiveRegs}; // Nothing to do (no live flags).
+  if (PhysLiveRegs == LiveRegs::None)
+    return {InsertPt, PhysLiveRegs}; // Nothing to do (no live regs).

   // Find the previous state change. We can not move before this point.
   MachineBasicBlock::iterator PrevStateChangeI;
@@ -650,15 +650,21 @@ MachineSMEABI::findStateChangeInsertionPoint(
   // Note: LiveUnits will only accurately track X0 and NZCV.
   LiveRegUnits LiveUnits(*TRI);
   setPhysLiveRegs(LiveUnits, PhysLiveRegs);
+  auto BestCandidate = std::make_pair(InsertPt, PhysLiveRegs);
   for (MachineBasicBlock::iterator I = InsertPt; I != PrevStateChangeI; --I) {
     // Don't move before/into a call (which may have a state change before it).
     if (I->getOpcode() == TII->getCallFrameDestroyOpcode() || I->isCall())
       break;
     LiveUnits.stepBackward(*I);
-    if (LiveUnits.available(AArch64::NZCV))
-      return {I, getPhysLiveRegs(LiveUnits)};
+    LiveRegs CurrentPhysLiveRegs = getPhysLiveRegs(LiveUnits);
+    // Find places where NZCV is available, but keep looking for locations where
+    // both NZCV and X0 are available, which can avoid some copies.
+    if (!(CurrentPhysLiveRegs & LiveRegs::NZCV))
+      BestCandidate = {I, CurrentPhysLiveRegs};
+    if (CurrentPhysLiveRegs == LiveRegs::None)
+      break;
   }
-  return {InsertPt, PhysLiveRegs};
+  return BestCandidate;
 }

 void MachineSMEABI::insertStateChanges(EmitContext &Context,
diff --git a/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir b/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir
index 3f174a62128a8..ed768dec77998 100644
--- a/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir
+++ b/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir
@@ -79,14 +79,12 @@ body: |
     ; CHECK-NEXT: RequiresZASavePseudo
     ; CHECK-NEXT: BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
     ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
-    ; CHECK-NEXT: $x0 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0
     ; CHECK-NEXT: MSRpstatesvcrImm1 2, 1, implicit-def $nzcv
     ; CHECK-NEXT: [[MRS:%[0-9]+]]:gpr64 = MRS 56965, implicit-def $nzcv
     ; CHECK-NEXT: $x0 = ADDXri %stack.0, 0, 0
     ; CHECK-NEXT: RestoreZAPseudo [[MRS]], $x0, &__arm_tpidr2_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x0
     ; CHECK-NEXT: MSR 56965, $xzr
-    ; CHECK-NEXT: $x0 = COPY [[COPY2]]
+    ; CHECK-NEXT: $x0 = IMPLICIT_DEF
     ; CHECK-NEXT: $nzcv = IMPLICIT_DEF
     ; CHECK-NEXT: FAKE_USE $x0
     ; CHECK-NEXT: $zab0 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index 30dbd1cb34667..0906e10b551b7 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -67,10 +67,10 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state
 ; CHECK-NEWLOWERING-NEXT: mov x0, x8
 ; CHECK-NEWLOWERING-NEXT: bl private_za_decl
 ; CHECK-NEWLOWERING-NEXT: bl private_za_decl
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: mov x1, x0
 ; CHECK-NEWLOWERING-NEXT: mov x0, x19
 ; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: mov x0, x1
 ; CHECK-NEWLOWERING-NEXT: mov sp, x29
 ; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
 ; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
@@ -170,11 +170,11 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
 ; CHECK-NEWLOWERING-NEXT: mov x0, x8
 ; CHECK-NEWLOWERING-NEXT: bl private_za_decl
 ; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: mov x1, x0
 ; CHECK-NEWLOWERING-NEXT: smstart sm
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
 ; CHECK-NEWLOWERING-NEXT: mov x0, x20
 ; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: mov x0, x1
 ; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
 ; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
@@ -267,14 +267,14 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
 ; CHECK-NEWLOWERING-NEXT: mov x0, x8
 ; CHECK-NEWLOWERING-NEXT: bl private_za_decl
 ; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: mov x1, x0
 ; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4
 ; CHECK-NEWLOWERING-NEXT: // %bb.3:
 ; CHECK-NEWLOWERING-NEXT: smstart sm
 ; CHECK-NEWLOWERING-NEXT: .LBB5_4:
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
 ; CHECK-NEWLOWERING-NEXT: mov x0, x19
 ; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: mov x0, x1
 ; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
 ; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
@@ -336,10 +336,10 @@ define i64 @test_many_callee_arguments(
 ; CHECK-NEWLOWERING-NEXT: mov x0, x8
 ; CHECK-NEWLOWERING-NEXT: bl many_args_private_za_callee
 ; CHECK-NEWLOWERING-NEXT: add sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: mov x1, x0
 ; CHECK-NEWLOWERING-NEXT: mov x0, x19
 ; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: mov x0, x1
 ; CHECK-NEWLOWERING-NEXT: mov sp, x29
 ; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
 ; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll b/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll
index 0c886c643c5fb..87a63fed0546c 100644
--- a/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll
+++ b/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll
@@ -87,8 +87,7 @@ define i32 @load_tls_shared_za() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT: .tlsdesccall x
 ; CHECK-NEXT: blr x1
 ; CHECK-NEXT: mrs x8, TPIDR_EL0
-; CHECK-NEXT: ldr w0, [x8, x0]
-; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: ldr w8, [x8, x0]
 ; CHECK-NEXT: smstart za
 ; CHECK-NEXT: mrs x9, TPIDR2_EL0
 ; CHECK-NEXT: sub x0, x29, #16
@@ -133,8 +132,7 @@ define i32 @load_tls_streaming_shared_za() nounwind "aarch64_inout_za" "aarch64_
 ; CHECK-NEXT: blr x1
 ; CHECK-NEXT: smstart sm
 ; CHECK-NEXT: mrs x8, TPIDR_EL0
-; CHECK-NEXT: ldr w0, [x8, x0]
-; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: ldr w8, [x8, x0]
 ; CHECK-NEXT: smstart za
 ; CHECK-NEXT: mrs x9, TPIDR2_EL0
 ; CHECK-NEXT: sub x0, x29, #80
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 50dd0c699284c..e672f777703a6 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -621,15 +621,15 @@ define i64 @test_many_callee_arguments(
 ; CHECK-NEWLOWERING-NEXT: stp x10, x11, [sp, #-16]!
 ; CHECK-NEWLOWERING-NEXT: bl many_args_private_za_callee
 ; CHECK-NEWLOWERING-NEXT: add sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: mov x1, x0
 ; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x9, TPIDR2_EL0
+; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x9, .LBB9_2
+; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB9_2
 ; CHECK-NEWLOWERING-NEXT: // %bb.1:
 ; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
 ; CHECK-NEWLOWERING-NEXT: .LBB9_2:
-; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: mov x0, x1
 ; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
 ; CHECK-NEWLOWERING-NEXT: mov sp, x29
 ; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
index 3aaae5e73ff23..37adfb89e4762 100644
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -33,7 +33,7 @@ define i32 @csr_d8_allocnxv4i32i32f64(double %d) "aarch64_pstate_sm_compatible"
 ; CHECK-COMMON-NEXT: ldr x29, [sp, #8] // 8-byte Reload
 ; CHECK-COMMON-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT: ret
-; CHECK-COMMON-NE
+; CHECK-NE
 entry:
   %a = alloca <vscale x 4 x i32>
   %b = alloca i32
@@ -626,23 +626,21 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK-NEWLOWERING-NEXT: mov x9, sp
 ; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
 ; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: sub x10, x29, #80
 ; CHECK-NEWLOWERING-NEXT: mov w20, w0
+; CHECK-NEWLOWERING-NEXT: sub x10, x29, #80
 ; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-80]
 ; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
 ; CHECK-NEWLOWERING-NEXT: smstop sm
 ; CHECK-NEWLOWERING-NEXT: bl other
 ; CHECK-NEWLOWERING-NEXT: smstart sm
-; CHECK-NEWLOWERING-NEXT: mov w0, w20
-; CHECK-NEWLOWERING-NEXT: mov w8, w0
 ; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x9, TPIDR2_EL0
+; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT: sub x0, x29, #80
-; CHECK-NEWLOWERING-NEXT: cbnz x9, .LBB8_2
+; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB8_2
 ; CHECK-NEWLOWERING-NEXT: // %bb.1: // %entry
 ; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
 ; CHECK-NEWLOWERING-NEXT: .LBB8_2: // %entry
-; CHECK-NEWLOWERING-NEXT: mov w0, w8
+; CHECK-NEWLOWERING-NEXT: mov w0, w20
 ; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
 ; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
 ; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa wsp, 112
@@ -671,4 +669,4 @@ entry:
   tail call void @other()
   ret i32 %x
 }
-declare void @other()
\ No newline at end of file
+declare void @other()
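
Note: the sketch below is a standalone illustration of the insertion-point search that the MachineSMEABIPass.cpp hunks above implement: scan backwards from the preferred insertion point, remember the latest spot where NZCV is free, and keep looking for a spot where both NZCV and X0 are free, which avoids the register copies removed in the updated tests. It is a simplified model, not the in-tree code; Inst, LiveAfterStep, and findInsertionPoint are made-up stand-ins for MachineInstr, LiveRegUnits::stepBackward, and the pass's real helpers.

#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Bitmask mirroring the pass's LiveRegs flags (values are illustrative).
enum LiveRegs : uint8_t { None = 0, NZCV = 1 << 0, X0 = 1 << 1 };

// Simplified stand-in for a machine instruction: whether it is a call, and
// which of the tracked registers are live once we step backwards over it.
struct Inst {
  bool IsCall = false;
  uint8_t LiveAfterStep = None;
};

// Walk backwards from InsertPt (inclusive) towards PrevStateChange
// (exclusive); assumes PrevStateChange <= InsertPt and both index into Insts.
// Remember the latest point where NZCV is free, but keep scanning for a point
// where both NZCV and X0 are free, since that avoids copies around the
// inserted state change.
static std::pair<std::size_t, uint8_t>
findInsertionPoint(const std::vector<Inst> &Insts, std::size_t InsertPt,
                   std::size_t PrevStateChange, uint8_t PhysLiveRegs) {
  if (PhysLiveRegs == None)
    return {InsertPt, PhysLiveRegs}; // Nothing to do (no live regs).

  std::pair<std::size_t, uint8_t> BestCandidate = {InsertPt, PhysLiveRegs};
  for (std::size_t I = InsertPt; I != PrevStateChange; --I) {
    if (Insts[I].IsCall)
      break; // Don't move before/into a call.
    uint8_t Live = Insts[I].LiveAfterStep;
    if (!(Live & NZCV))
      BestCandidate = {I, Live}; // NZCV is free here; remember this point.
    if (Live == None)
      break; // NZCV and X0 both free: no better point exists.
  }
  return BestCandidate;
}

The in-tree pass walks a MachineBasicBlock::iterator and recomputes liveness with LiveRegUnits as it steps backwards; the sketch precomputes that per instruction only to stay self-contained.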