From c8ce2df65fa49de5feda09073605a0ebc7fbb604 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 9 Sep 2025 11:13:57 +0000 Subject: [PATCH 1/2] [AArch64][SME] Allow SME peephole optimizations across SME pseudos This allows folding `smstart/stops` in more cases. --- llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp | 5 + llvm/test/CodeGen/AArch64/sme-agnostic-za.ll | 12 +- .../test/CodeGen/AArch64/sme-peephole-opts.ll | 127 +++++++++++++++++- 3 files changed, 132 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp index 85cca1de47b78..ec70ddfb5fcf1 100644 --- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp @@ -184,6 +184,11 @@ bool SMEPeepholeOpt::optimizeStartStopPairs( isSVERegOp(TRI, MRI, MI.getOperand(1))) Prev = nullptr; break; + case AArch64::RestoreZAPseudo: + case AArch64::InOutZAUsePseudo: + case AArch64::CommitZASavePseudo: + case AArch64::SMEStateAllocPseudo: + case AArch64::RequiresZASavePseudo: case AArch64::ADJCALLSTACKDOWN: case AArch64::ADJCALLSTACKUP: case AArch64::ANDXri: diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll index a0a14f2ffae3f..e3007a3723484 100644 --- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll +++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll @@ -169,8 +169,6 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou ; CHECK-NEWLOWERING-NEXT: smstop sm ; CHECK-NEWLOWERING-NEXT: mov x0, x8 ; CHECK-NEWLOWERING-NEXT: bl private_za_decl -; CHECK-NEWLOWERING-NEXT: smstart sm -; CHECK-NEWLOWERING-NEXT: smstop sm ; CHECK-NEWLOWERING-NEXT: bl private_za_decl ; CHECK-NEWLOWERING-NEXT: smstart sm ; CHECK-NEWLOWERING-NEXT: mov x8, x0 @@ -268,19 +266,11 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee( ; CHECK-NEWLOWERING-NEXT: .LBB5_2: ; CHECK-NEWLOWERING-NEXT: mov x0, x8 ; CHECK-NEWLOWERING-NEXT: bl private_za_decl +; CHECK-NEWLOWERING-NEXT: bl private_za_decl ; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4 ; CHECK-NEWLOWERING-NEXT: // %bb.3: ; CHECK-NEWLOWERING-NEXT: smstart sm ; CHECK-NEWLOWERING-NEXT: .LBB5_4: -; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_6 -; CHECK-NEWLOWERING-NEXT: // %bb.5: -; CHECK-NEWLOWERING-NEXT: smstop sm -; CHECK-NEWLOWERING-NEXT: .LBB5_6: -; CHECK-NEWLOWERING-NEXT: bl private_za_decl -; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_8 -; CHECK-NEWLOWERING-NEXT: // %bb.7: -; CHECK-NEWLOWERING-NEXT: smstart sm -; CHECK-NEWLOWERING-NEXT: .LBB5_8: ; CHECK-NEWLOWERING-NEXT: mov x8, x0 ; CHECK-NEWLOWERING-NEXT: mov x0, x19 ; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll index 80827c2547780..442636cfc8398 100644 --- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-new-sme-abi -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s declare void @callee() declare void @callee_sm() "aarch64_pstate_sm_enabled" @@ -554,3 +554,128 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" { store %res1, ptr %ptr ret void } + +; normal caller -> streaming callees 
(with ZA state) +define void @test14(ptr %callee) nounwind "aarch64_inout_za" { +; CHECK-LABEL: test14: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #80 +; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: bl callee_sm +; CHECK-NEXT: bl callee_sm +; CHECK-NEXT: smstop sm +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: cbnz x8, .LBB15_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB15_2: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee_sm() + call void @callee_sm() + ret void +} + +; normal caller -> streaming callees (with ZA agnostic state) +define void @test15(ptr %callee) nounwind "aarch64_za_state_agnostic" { +; CHECK-LABEL: test15: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: sub sp, sp, x0 +; CHECK-NEXT: mov x20, sp +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: smstart sm +; CHECK-NEXT: bl callee_sm +; CHECK-NEXT: bl callee_sm +; CHECK-NEXT: smstop sm +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee_sm() + call void @callee_sm() + ret void +} + +; locally streaming caller -> normal callees (with ZA state) +define void @test16(ptr %callee) nounwind "aarch64_pstate_sm_body" "aarch64_new_za" { +; CHECK-LABEL: test16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbz x8, .LBB17_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: zero {za} +; CHECK-NEXT: .LBB17_2: +; CHECK-NEXT: smstart za +; CHECK-NEXT: smstart sm +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: bl callee +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee() + call void @callee() + ret void +} From 5fe510e7c35ea36cac231a04c13c0eb79f21e13a Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 15 Sep 2025 09:34:43 +0000 Subject: [PATCH 2/2] Add check --- llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp index ec70ddfb5fcf1..2a563663a34d1 100644 --- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp @@ -189,6 +189,12 @@ bool SMEPeepholeOpt::optimizeStartStopPairs( case AArch64::CommitZASavePseudo: case AArch64::SMEStateAllocPseudo: case AArch64::RequiresZASavePseudo: + // These instructions only depend on the ZA state, not the streaming mode, + // so if the pair of smstart/stop is only changing the streaming mode, we + // can permit these instructions. + if (Prev->getOperand(0).getImm() != AArch64SVCR::SVCRSM) + Prev = nullptr; + break; case AArch64::ADJCALLSTACKDOWN: case AArch64::ADJCALLSTACKUP: case AArch64::ANDXri:
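
Note on the check added in the second commit: the ZA-management pseudos listed in the switch only depend on ZA state, so a pending smstart/smstop candidate (`Prev`) is kept alive across them when its SVCR operand is `AArch64SVCR::SVCRSM`, i.e. when the pair only toggles the streaming mode, and is dropped (`Prev = nullptr`) otherwise. The sketch below is a minimal standalone model of that rule, assuming invented enum and function names; it is not the pass code and uses no real LLVM identifiers.

// Standalone sketch only: the names below are made up for illustration and
// do not correspond to actual LLVM/AArch64 identifiers.
#include <iostream>

// Stands in for the SVCR mask carried by an smstart/smstop.
enum class PStateMask { SM, ZA, SMZA };

// Coarse classification of the instructions the peephole walks over.
enum class Kind {
  ZAOnlyPseudo, // ZA-management pseudos, e.g. those added to the switch
  Other         // anything that must be treated conservatively
};

// Can a pending smstart/smstop candidate that toggles CandidateMask stay
// alive across an instruction of kind K? A "no" corresponds to the
// Prev = nullptr resets in the pass.
static bool keepsCandidateAlive(Kind K, PStateMask CandidateMask) {
  switch (K) {
  case Kind::ZAOnlyPseudo:
    // The rule added here: these pseudos only touch ZA state, so they are
    // harmless if the candidate pair only changes PSTATE.SM.
    return CandidateMask == PStateMask::SM;
  case Kind::Other:
    return false; // unknown side effects: drop the candidate
  }
  return false;
}

int main() {
  // A ZA pseudo between an SM-only smstop/smstart keeps the candidate alive
  // (prints 1), but kills it if the pair also changes ZA state (prints 0).
  std::cout << keepsCandidateAlive(Kind::ZAOnlyPseudo, PStateMask::SM) << '\n';
  std::cout << keepsCandidateAlive(Kind::ZAOnlyPseudo, PStateMask::SMZA) << '\n';
}

The real check reads the immediate SVCR operand of `Prev` directly (operand 0 of the smstart/smstop), as shown in the hunk above, so no extra per-instruction state is needed.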