Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,17 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
isSVERegOp(TRI, MRI, MI.getOperand(1)))
Prev = nullptr;
break;
case AArch64::RestoreZAPseudo:
case AArch64::InOutZAUsePseudo:
case AArch64::CommitZASavePseudo:
case AArch64::SMEStateAllocPseudo:
case AArch64::RequiresZASavePseudo:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I probably need to read through that pass, but the comment on top of that switch says that we are searching for instructions that are "agnostic of streaming mode". If we get e.g. a InOutZAUsePseudo, we will modify the ZA/SME state, but I guess it's fine given that there is a SMSTOP later on? I.e. the state will be discarded.

But what if we have a InOutZAUsePseudo followed by RequiresZASavePseudo? Wouldn't that change the ZA state more globally?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added an explicit check now, but I only intend to allow the smstart/stop fold across pairs that only change the streaming mode, not ZA. These pseudos all operate independently of whether or not we're in streaming mode, but may require ZA state.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But what if we have a InOutZAUsePseudo followed by RequiresZASavePseudo? Wouldn't that change the ZA state more globally?

Note: With the order of passes (MachineSMEABI -> SMEPeepholeOpt) the marker nodes don't have any meaning at this stage (they're just leftovers), but I think this may matter for the Restore/CommitZASavePseudo.

// These instructions only depend on the ZA state, not the streaming mode,
// so if the pair of smstart/stop is only changing the streaming mode, we
// can permit these instructions.
if (Prev->getOperand(0).getImm() != AArch64SVCR::SVCRSM)
Prev = nullptr;
break;
case AArch64::ADJCALLSTACKDOWN:
case AArch64::ADJCALLSTACKUP:
case AArch64::ANDXri:
Expand Down
12 changes: 1 addition & 11 deletions llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,6 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
; CHECK-NEWLOWERING-NEXT: smstop sm
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: smstop sm
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: mov x8, x0
Expand Down Expand Up @@ -268,19 +266,11 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
; CHECK-NEWLOWERING-NEXT: .LBB5_2:
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4
; CHECK-NEWLOWERING-NEXT: // %bb.3:
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: .LBB5_4:
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_6
; CHECK-NEWLOWERING-NEXT: // %bb.5:
; CHECK-NEWLOWERING-NEXT: smstop sm
; CHECK-NEWLOWERING-NEXT: .LBB5_6:
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_8
; CHECK-NEWLOWERING-NEXT: // %bb.7:
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: .LBB5_8:
; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
Expand Down
127 changes: 126 additions & 1 deletion llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-new-sme-abi -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s

declare void @callee()
declare void @callee_sm() "aarch64_pstate_sm_enabled"
Expand Down Expand Up @@ -554,3 +554,128 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
store <vscale x 4 x float> %res1, ptr %ptr
ret void
}

; normal caller -> streaming callees (with ZA state)
define void @test14(ptr %callee) nounwind "aarch64_inout_za" {
; CHECK-LABEL: test14:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: add x29, sp, #64
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: sub x10, x29, #80
; CHECK-NEXT: stp x9, x8, [x29, #-80]
; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl callee_sm
; CHECK-NEXT: bl callee_sm
; CHECK-NEXT: smstop sm
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #80
; CHECK-NEXT: cbnz x8, .LBB15_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: bl __arm_tpidr2_restore
; CHECK-NEXT: .LBB15_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: sub sp, x29, #64
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @callee_sm()
call void @callee_sm()
ret void
}

; normal caller -> streaming callees (with ZA agnostic state)
define void @test15(ptr %callee) nounwind "aarch64_za_state_agnostic" {
; CHECK-LABEL: test15:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: add x29, sp, #64
; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state_size
; CHECK-NEXT: sub sp, sp, x0
; CHECK-NEXT: mov x20, sp
; CHECK-NEXT: mov x0, x20
; CHECK-NEXT: bl __arm_sme_save
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl callee_sm
; CHECK-NEXT: bl callee_sm
; CHECK-NEXT: smstop sm
; CHECK-NEXT: mov x0, x20
; CHECK-NEXT: bl __arm_sme_restore
; CHECK-NEXT: sub sp, x29, #64
; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @callee_sm()
call void @callee_sm()
ret void
}

; locally streaming caller -> normal callees (with ZA state)
define void @test16(ptr %callee) nounwind "aarch64_pstate_sm_body" "aarch64_new_za" {
; CHECK-LABEL: test16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: add x29, sp, #64
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: stp x9, x8, [x29, #-80]
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: cbz x8, .LBB17_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: bl __arm_tpidr2_save
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: zero {za}
; CHECK-NEXT: .LBB17_2:
; CHECK-NEXT: smstart za
; CHECK-NEXT: smstart sm
; CHECK-NEXT: sub x8, x29, #80
; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl callee
; CHECK-NEXT: bl callee
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: smstop za
; CHECK-NEXT: sub sp, x29, #64
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @callee()
call void @callee()
ret void
}
Loading