-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AArch64][SME] Allow SME peephole optimizations across SME pseudos #157655
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This allows folding `smstart/stops` in more cases.
@llvm/pr-subscribers-backend-aarch64 Author: Benjamin Maxwell (MacDue). Changes: This allows folding `smstart/stops` in more cases. Full diff: https://github.com/llvm/llvm-project/pull/157655.diff — 3 files affected:
diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
index 85cca1de47b78..ec70ddfb5fcf1 100644
--- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
@@ -184,6 +184,11 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
isSVERegOp(TRI, MRI, MI.getOperand(1)))
Prev = nullptr;
break;
+ case AArch64::RestoreZAPseudo:
+ case AArch64::InOutZAUsePseudo:
+ case AArch64::CommitZASavePseudo:
+ case AArch64::SMEStateAllocPseudo:
+ case AArch64::RequiresZASavePseudo:
case AArch64::ADJCALLSTACKDOWN:
case AArch64::ADJCALLSTACKUP:
case AArch64::ANDXri:
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index a0a14f2ffae3f..e3007a3723484 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -169,8 +169,6 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
; CHECK-NEWLOWERING-NEXT: smstop sm
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
-; CHECK-NEWLOWERING-NEXT: smstart sm
-; CHECK-NEWLOWERING-NEXT: smstop sm
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: mov x8, x0
@@ -268,19 +266,11 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
; CHECK-NEWLOWERING-NEXT: .LBB5_2:
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4
; CHECK-NEWLOWERING-NEXT: // %bb.3:
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: .LBB5_4:
-; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_6
-; CHECK-NEWLOWERING-NEXT: // %bb.5:
-; CHECK-NEWLOWERING-NEXT: smstop sm
-; CHECK-NEWLOWERING-NEXT: .LBB5_6:
-; CHECK-NEWLOWERING-NEXT: bl private_za_decl
-; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_8
-; CHECK-NEWLOWERING-NEXT: // %bb.7:
-; CHECK-NEWLOWERING-NEXT: smstart sm
-; CHECK-NEWLOWERING-NEXT: .LBB5_8:
; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index 80827c2547780..442636cfc8398 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-new-sme-abi -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s
declare void @callee()
declare void @callee_sm() "aarch64_pstate_sm_enabled"
@@ -554,3 +554,128 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
store <vscale x 4 x float> %res1, ptr %ptr
ret void
}
+
+; normal caller -> streaming callees (with ZA state)
+define void @test14(ptr %callee) nounwind "aarch64_inout_za" {
+; CHECK-LABEL: test14:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: sub x10, x29, #80
+; CHECK-NEXT: stp x9, x8, [x29, #-80]
+; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: bl callee_sm
+; CHECK-NEXT: bl callee_sm
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #80
+; CHECK-NEXT: cbnz x8, .LBB15_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB15_2:
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @callee_sm()
+ call void @callee_sm()
+ ret void
+}
+
+; normal caller -> streaming callees (with ZA agnostic state)
+define void @test15(ptr %callee) nounwind "aarch64_za_state_agnostic" {
+; CHECK-LABEL: test15:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: bl __arm_sme_state_size
+; CHECK-NEXT: sub sp, sp, x0
+; CHECK-NEXT: mov x20, sp
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: bl __arm_sme_save
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: bl callee_sm
+; CHECK-NEXT: bl callee_sm
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: bl __arm_sme_restore
+; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @callee_sm()
+ call void @callee_sm()
+ ret void
+}
+
+; locally streaming caller -> normal callees (with ZA state)
+define void @test16(ptr %callee) nounwind "aarch64_pstate_sm_body" "aarch64_new_za" {
+; CHECK-LABEL: test16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: stp x9, x8, [x29, #-80]
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: cbz x8, .LBB17_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: bl __arm_tpidr2_save
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: zero {za}
+; CHECK-NEXT: .LBB17_2:
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: sub x8, x29, #80
+; CHECK-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: smstop za
+; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @callee()
+ call void @callee()
+ ret void
+}
case AArch64::InOutZAUsePseudo:
case AArch64::CommitZASavePseudo:
case AArch64::SMEStateAllocPseudo:
case AArch64::RequiresZASavePseudo:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I probably need to read through that pass, but the comment on top of that switch
says that we are searching for instructions that are "agnostic of streaming mode". If we get e.g. an `InOutZAUsePseudo`, we will modify the ZA/SME state, but I guess it's fine given that there is an SMSTOP later on? I.e. the state will be discarded.
But what if we have an `InOutZAUsePseudo` followed by a `RequiresZASavePseudo`? Wouldn't that change the ZA state more globally?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've added an explicit check now, but I only intend to allow the smstart/stop
fold across pairs that only change the streaming mode, not ZA. These pseudos all operate independently of whether or not we're in streaming mode, but may require ZA state.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But what if we have a InOutZAUsePseudo followed by RequiresZASavePseudo? Wouldn't that change the ZA state more globally?
Note: With the order of passes (MachineSMEABI -> SMEPeepholeOpt) the marker nodes don't have any meaning at this stage (they're just leftovers), but I think this may matter for the `RestoreZAPseudo`/`CommitZASavePseudo`.
(llvm#157655) This allows folding `smstart/stops` in more cases.