From aae8b5beb4b7b6510d728b4c4f42b9016e08127d Mon Sep 17 00:00:00 2001 From: Mary Kassayova Date: Mon, 8 Sep 2025 16:27:06 +0000 Subject: [PATCH 1/4] [AArch64][SME] Introduce CHECK_MATCHING_VL pseudo for safe streaming mode transitions --- .../Target/AArch64/AArch64ISelLowering.cpp | 90 ++++- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 7 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 6 + .../AArch64/sme-callee-save-restore-pairs.ll | 26 +- .../test/CodeGen/AArch64/sme-peephole-opts.ll | 14 +- .../CodeGen/AArch64/sme-streaming-checkvl.ll | 335 ++++++++++++++++++ .../sme-streaming-compatible-interface.ll | 26 +- llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 14 + .../CodeGen/AArch64/spill-reload-remarks.ll | 2 +- 9 files changed, 501 insertions(+), 19 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 351235dd5bcdd..9380522d65aae 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2940,6 +2940,52 @@ AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI, return NextInst->getParent(); } +MachineBasicBlock * +AArch64TargetLowering::EmitCheckVL(MachineInstr &MI, + MachineBasicBlock *MBB) const { + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + DebugLoc DL = MI.getDebugLoc(); + MachineFunction::iterator It = ++MBB->getIterator(); + + const TargetRegisterClass *RC = &AArch64::GPR64RegClass; + MachineRegisterInfo &MRI = MF->getRegInfo(); + + Register RegVL = MRI.createVirtualRegister(RC); + Register RegSVL = MRI.createVirtualRegister(RC); + Register RegCheck = MRI.createVirtualRegister(RC); + + BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL).addImm(1); + BuildMI(*MBB, MI, DL, TII->get(AArch64::RDSVLI_XI), RegSVL).addImm(1); + + BuildMI(*MBB, MI, DL, TII->get(AArch64::SUBXrr), RegCheck) + .addReg(RegVL) + .addReg(RegSVL); + + MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, TrapBB); + MF->insert(It, PassBB); + + BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX)) + .addReg(RegCheck) + .addMBB(PassBB); + + // Transfer rest of current BB to PassBB + PassBB->splice(PassBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + PassBB->transferSuccessorsAndUpdatePHIs(MBB); + + BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1); + + MBB->addSuccessor(TrapBB); + MBB->addSuccessor(PassBB); + + MI.eraseFromParent(); + return PassBB; +} + MachineBasicBlock * AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, @@ -3343,6 +3389,9 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case AArch64::PROBED_STACKALLOC_DYN: return EmitDynamicProbedAlloc(MI, BB); + case AArch64::CHECK_MATCHING_VL: + return EmitCheckVL(MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_B: return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB); case AArch64::LD1_MXIPXX_H_PSEUDO_H: @@ -9116,7 +9165,8 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, - unsigned Condition) const { + unsigned Condition, + bool HasSVECC) const { MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setHasStreamingModeChanges(true); @@ -9147,7 +9197,40 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL, if (InGlue) Ops.push_back(InGlue); - return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); + if (!HasSVECC) + return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); + + auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue { + SmallVector Ops = {Chain}; + if (InGlue) + Ops.push_back(InGlue); + return SDValue(DAG.getMachineNode(AArch64::CHECK_MATCHING_VL, DL, + DAG.getVTList(MVT::Other, MVT::Glue), + Ops), + 0); + }; + + // NS -> S + if (Enable) { + SDValue CheckVL = GetCheckVL(Chain, InGlue); + + // Replace chain + Ops[0] = CheckVL.getValue(0); + + // Replace/append glue + if (InGlue) + Ops.back() = CheckVL.getValue(1); + else + Ops.push_back(CheckVL.getValue(1)); + + return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); + } + + // S -> NS + SDValue StreamingModeInstr = + DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); + return GetCheckVL(StreamingModeInstr.getValue(0), + StreamingModeInstr.getValue(1)); } // Emit a call to __arm_sme_save or __arm_sme_restore. @@ -9732,7 +9815,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (RequiresSMChange) { Chain = changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(), - Chain, InGlue, getSMToggleCondition(CallAttrs)); + Chain, InGlue, getSMToggleCondition(CallAttrs), + CallConv == CallingConv::AArch64_SVE_VectorCall); InGlue = Chain.getValue(1); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index f5d14905cac66..e6385a059b875 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -168,6 +168,9 @@ class AArch64TargetLowering : public TargetLowering { MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const; + MachineBasicBlock *EmitCheckVL(MachineInstr &MI, + MachineBasicBlock *MBB) const; + MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const; @@ -532,8 +535,8 @@ class AArch64TargetLowering : public TargetLowering { /// node. \p Condition should be one of the enum values from /// AArch64SME::ToggleCondition. SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, - SDValue Chain, SDValue InGlue, - unsigned Condition) const; + SDValue Chain, SDValue InGlue, unsigned Condition, + bool HasSVECC = false) const; bool isVScaleKnownToBeAPowerOfTwo() const override { return true; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index f0020a9a3c91d..7cc3dc9a2171c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1367,6 +1367,12 @@ def PROBED_STACKALLOC_DYN : Pseudo<(outs), } // Defs = [SP, NZCV], Uses = [SP] in } // hasSideEffects = 1, isCodeGenOnly = 1 +// Pseudo-instruction that compares the current SVE vector length (VL) with the +// streaming vector length (SVL). If the two lengths do not match, the check +// lowers to a `brk`, causing a trap. +let hasSideEffects = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in +def CHECK_MATCHING_VL : Pseudo<(outs), (ins), []>, Sched<[]>; + let isReMaterializable = 1, isCodeGenOnly = 1 in { // FIXME: The following pseudo instructions are only needed because remat // cannot handle multiple instructions. When that changes, they can be diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll index cf42db7aa65bd..af6ebb3846738 100644 --- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll +++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll @@ -47,12 +47,19 @@ define void @fbyte( %v) #0{ ; NOPAIR-NEXT: // %bb.1: ; NOPAIR-NEXT: smstop sm ; NOPAIR-NEXT: .LBB0_2: +; NOPAIR-NEXT: rdvl x8, #1 +; NOPAIR-NEXT: rdsvl x9, #1 +; NOPAIR-NEXT: cmp x8, x9 +; NOPAIR-NEXT: b.eq .LBB0_4 +; NOPAIR-NEXT: // %bb.3: +; NOPAIR-NEXT: brk #0x1 +; NOPAIR-NEXT: .LBB0_4: ; NOPAIR-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; NOPAIR-NEXT: bl my_func2 -; NOPAIR-NEXT: tbz w19, #0, .LBB0_4 -; NOPAIR-NEXT: // %bb.3: +; NOPAIR-NEXT: tbz w19, #0, .LBB0_6 +; NOPAIR-NEXT: // %bb.5: ; NOPAIR-NEXT: smstart sm -; NOPAIR-NEXT: .LBB0_4: +; NOPAIR-NEXT: .LBB0_6: ; NOPAIR-NEXT: addvl sp, sp, #1 ; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload @@ -127,12 +134,19 @@ define void @fbyte( %v) #0{ ; PAIR-NEXT: // %bb.1: ; PAIR-NEXT: smstop sm ; PAIR-NEXT: .LBB0_2: +; PAIR-NEXT: rdvl x8, #1 +; PAIR-NEXT: rdsvl x9, #1 +; PAIR-NEXT: cmp x8, x9 +; PAIR-NEXT: b.eq .LBB0_4 +; PAIR-NEXT: // %bb.3: +; PAIR-NEXT: brk #0x1 +; PAIR-NEXT: .LBB0_4: ; PAIR-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; PAIR-NEXT: bl my_func2 -; PAIR-NEXT: tbz w19, #0, .LBB0_4 -; PAIR-NEXT: // %bb.3: +; PAIR-NEXT: tbz w19, #0, .LBB0_6 +; PAIR-NEXT: // %bb.5: ; PAIR-NEXT: smstart sm -; PAIR-NEXT: .LBB0_4: +; PAIR-NEXT: .LBB0_6: ; PAIR-NEXT: addvl sp, sp, #1 ; PAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; PAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll index 80827c2547780..9ff414d401426 100644 --- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -527,14 +527,26 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: b.ne .LBB14_2 +; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: bl callee_farg_fret ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: smstop sm +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: b.eq .LBB14_3 +; CHECK-NEXT: .LBB14_2: +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB14_3: ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl callee_farg_fret ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll new file mode 100644 index 0000000000000..631d120f3c872 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll @@ -0,0 +1,335 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 < %s -o - | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +declare void @bar_enabled() #0 + +; Non-streaming -> calls streaming callee +define void @foo_non_streaming_pass_arg(ptr %arg) { +; CHECK-LABEL: foo_non_streaming_pass_arg: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w28, -8 +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: b.eq .LBB0_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB0_2: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: sub x8, x29, #64 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: bl bar_enabled +; CHECK-NEXT: smstop sm +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 96 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = load , ptr %arg, align 16 + tail call void @bar_enabled( %v) #0 + ret void +} + +; Streaming-compatible -> calls streaming callee +define void @foo_streaming_compatible_pass_arg(ptr %arg) #1 { +; CHECK-LABEL: foo_streaming_compatible_pass_arg: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 1136 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #1120] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1088 +; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w28, -24 +; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: .cfi_offset w30, -40 +; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: .cfi_offset b8, -1080 +; CHECK-NEXT: .cfi_offset b9, -1088 +; CHECK-NEXT: .cfi_offset b10, -1096 +; CHECK-NEXT: .cfi_offset b11, -1104 +; CHECK-NEXT: .cfi_offset b12, -1112 +; CHECK-NEXT: .cfi_offset b13, -1120 +; CHECK-NEXT: .cfi_offset b14, -1128 +; CHECK-NEXT: .cfi_offset b15, -1136 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: rdvl x9, #1 +; CHECK-NEXT: rdsvl x10, #1 +; CHECK-NEXT: cmp x9, x10 +; CHECK-NEXT: b.eq .LBB1_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB1_2: // %entry +; CHECK-NEXT: ldr z0, [x8] +; CHECK-NEXT: sub x8, x29, #1088 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: tbnz w0, #0, .LBB1_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_4: // %entry +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: bl bar_enabled +; CHECK-NEXT: tbnz w19, #0, .LBB1_6 +; CHECK-NEXT: // %bb.5: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_6: // %entry +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa wsp, 1136 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #1120] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = load , ptr %arg, align 16 + tail call void @bar_enabled( %v) #0 + ret void +} + +declare @bar_retv_enabled() #0 + +; Non-streaming -> returns SVE value from streaming callee +define void @foo_non_streaming_retval(ptr %ptr) { +; CHECK-LABEL: foo_non_streaming_retval: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 112 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x28, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w28, -16 +; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: .cfi_offset w30, -40 +; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: .cfi_offset b8, -56 +; CHECK-NEXT: .cfi_offset b9, -64 +; CHECK-NEXT: .cfi_offset b10, -72 +; CHECK-NEXT: .cfi_offset b11, -80 +; CHECK-NEXT: .cfi_offset b12, -88 +; CHECK-NEXT: .cfi_offset b13, -96 +; CHECK-NEXT: .cfi_offset b14, -104 +; CHECK-NEXT: .cfi_offset b15, -112 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: b.eq .LBB2_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB2_2: // %entry +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: bl bar_retv_enabled +; CHECK-NEXT: sub x8, x29, #64 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 112 +; CHECK-NEXT: ldp x28, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = tail call @bar_retv_enabled() #0 + store %v, ptr %ptr, align 16 + ret void +} + +; Streaming-compatible -> returns SVE value from streaming callee +define void @foo_streaming_compatible_retval(ptr %ptr) #1 { +; CHECK-LABEL: foo_streaming_compatible_retval: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 1136 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill +; CHECK-NEXT: str x20, [sp, #1120] // 8-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #1128] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1088 +; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w28, -24 +; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: .cfi_offset w30, -40 +; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: .cfi_offset b8, -1080 +; CHECK-NEXT: .cfi_offset b9, -1088 +; CHECK-NEXT: .cfi_offset b10, -1096 +; CHECK-NEXT: .cfi_offset b11, -1104 +; CHECK-NEXT: .cfi_offset b12, -1112 +; CHECK-NEXT: .cfi_offset b13, -1120 +; CHECK-NEXT: .cfi_offset b14, -1128 +; CHECK-NEXT: .cfi_offset b15, -1136 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: b.eq .LBB3_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB3_2: // %entry +; CHECK-NEXT: tbnz w0, #0, .LBB3_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB3_4: // %entry +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: bl bar_retv_enabled +; CHECK-NEXT: sub x8, x29, #1088 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: tbnz w20, #0, .LBB3_6 +; CHECK-NEXT: // %bb.5: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB3_6: // %entry +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa wsp, 1136 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #1128] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x20, [sp, #1120] // 8-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w20 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = tail call @bar_retv_enabled() #0 + store %v, ptr %ptr, align 16 + ret void +} + +attributes #0 = { "aarch64_pstate_sm_enabled" } +attributes #1 = { "aarch64_pstate_sm_compatible" } diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 9088986ee9b72..95fb68945de44 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -209,13 +209,20 @@ define @streaming_compatible_with_scalable_vectors( @streaming_compatible_with_predicate_vectors( %x) #0 { ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: smstop sm +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: b.eq .LBB3_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: bl scalable_callee ; CHECK-NEXT: smstart sm @@ -472,6 +479,13 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: //APP ; FP-CHECK-NEXT: //NO_APP ; FP-CHECK-NEXT: smstop sm +; FP-CHECK-NEXT: rdvl x8, #1 +; FP-CHECK-NEXT: rdsvl x9, #1 +; FP-CHECK-NEXT: cmp x8, x9 +; FP-CHECK-NEXT: b.eq .LBB3_2 +; FP-CHECK-NEXT: // %bb.1: +; FP-CHECK-NEXT: brk #0x1 +; FP-CHECK-NEXT: .LBB3_2: ; FP-CHECK-NEXT: ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload ; FP-CHECK-NEXT: bl scalable_callee ; FP-CHECK-NEXT: smstart sm diff --git a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll index a23854759d688..9e711fe05f462 100644 --- a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll +++ b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll @@ -2,7 +2,7 @@ ; We should have both spill and reload for %arg. -; CHECK: remark: :0:0: 2 spills 2.000000e+00 total spills cost 3 reloads 3.000000e+00 total reloads cost generated in function +; CHECK: remark: :0:0: 2 spills 1.500000e+00 total spills cost 3 reloads 1.500000e+00 total reloads cost 1 virtual registers copies 1.000000e+00 total copies cost generated in function define @streaming_compatible_with_predicate_vectors( %arg) "aarch64_pstate_sm_compatible" nounwind #0 { %res = call @normal_callee_predicate_vec_arg( %arg) %and = and %res, %arg From ac52098fe468020a6b72819096367496bdc9c292 Mon Sep 17 00:00:00 2001 From: Mary Kassayova Date: Wed, 10 Sep 2025 13:40:05 +0000 Subject: [PATCH 2/4] Addressed comments --- .../Target/AArch64/AArch64ISelLowering.cpp | 37 ++-- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 6 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 6 - .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 11 + .../CodeGen/AArch64/sme-streaming-checkvl.ll | 191 ++++++++++++++++-- .../CodeGen/AArch64/spill-reload-remarks.ll | 2 +- 6 files changed, 202 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 9380522d65aae..19ae0e848a9e6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2941,8 +2941,8 @@ AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI, } MachineBasicBlock * -AArch64TargetLowering::EmitCheckVL(MachineInstr &MI, - MachineBasicBlock *MBB) const { +AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI, + MachineBasicBlock *MBB) const { MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); @@ -3389,8 +3389,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case AArch64::PROBED_STACKALLOC_DYN: return EmitDynamicProbedAlloc(MI, BB); - case AArch64::CHECK_MATCHING_VL: - return EmitCheckVL(MI, BB); + case AArch64::CHECK_MATCHING_VL_PSEUDO: + return EmitCheckMatchingVL(MI, BB); case AArch64::LD1_MXIPXX_H_PSEUDO_B: return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB); @@ -9162,11 +9162,9 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, } } -SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL, - bool Enable, SDValue Chain, - SDValue InGlue, - unsigned Condition, - bool HasSVECC) const { +SDValue AArch64TargetLowering::changeStreamingMode( + SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, + unsigned Condition, bool InsertVectorLengthCheck) const { MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setHasStreamingModeChanges(true); @@ -9197,20 +9195,18 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL, if (InGlue) Ops.push_back(InGlue); - if (!HasSVECC) + if (!InsertVectorLengthCheck) return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue { SmallVector Ops = {Chain}; if (InGlue) Ops.push_back(InGlue); - return SDValue(DAG.getMachineNode(AArch64::CHECK_MATCHING_VL, DL, - DAG.getVTList(MVT::Other, MVT::Glue), - Ops), - 0); + return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL, + DAG.getVTList(MVT::Other, MVT::Glue), Ops); }; - // NS -> S + // Non-streaming -> Streaming if (Enable) { SDValue CheckVL = GetCheckVL(Chain, InGlue); @@ -9226,7 +9222,7 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL, return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); } - // S -> NS + // Streaming -> Non-streaming SDValue StreamingModeInstr = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); return GetCheckVL(StreamingModeInstr.getValue(0), @@ -9813,10 +9809,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue InGlue; if (RequiresSMChange) { - Chain = - changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(), - Chain, InGlue, getSMToggleCondition(CallAttrs), - CallConv == CallingConv::AArch64_SVE_VectorCall); + bool InsertVectorLengthCheck = + (CallConv == CallingConv::AArch64_SVE_VectorCall); + Chain = changeStreamingMode( + DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue, + getSMToggleCondition(CallAttrs), InsertVectorLengthCheck); InGlue = Chain.getValue(1); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index e6385a059b875..ff073d3eafb1f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -168,8 +168,8 @@ class AArch64TargetLowering : public TargetLowering { MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const; - MachineBasicBlock *EmitCheckVL(MachineInstr &MI, - MachineBasicBlock *MBB) const; + MachineBasicBlock *EmitCheckMatchingVL(MachineInstr &MI, + MachineBasicBlock *MBB) const; MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, @@ -536,7 +536,7 @@ class AArch64TargetLowering : public TargetLowering { /// AArch64SME::ToggleCondition. SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, - bool HasSVECC = false) const; + bool InsertVectorLengthCheck = false) const; bool isVScaleKnownToBeAPowerOfTwo() const override { return true; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 7cc3dc9a2171c..f0020a9a3c91d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1367,12 +1367,6 @@ def PROBED_STACKALLOC_DYN : Pseudo<(outs), } // Defs = [SP, NZCV], Uses = [SP] in } // hasSideEffects = 1, isCodeGenOnly = 1 -// Pseudo-instruction that compares the current SVE vector length (VL) with the -// streaming vector length (SVL). If the two lengths do not match, the check -// lowers to a `brk`, causing a trap. -let hasSideEffects = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in -def CHECK_MATCHING_VL : Pseudo<(outs), (ins), []>, Sched<[]>; - let isReMaterializable = 1, isCodeGenOnly = 1 in { // FIXME: The following pseudo instructions are only needed because remat // cannot handle multiple instructions. When that changes, they can be diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 601dc34d74b9c..430b7382de216 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -48,6 +48,17 @@ let usesCustomInserter = 1 in { } def : Pat<(i64 (AArch64EntryPStateSM)), (EntryPStateSM)>; +// Pseudo-instruction that compares the current SVE vector length (VL) with the +// streaming vector length (SVL). If the two lengths do not match, the check +// lowers to a `brk`, causing a trap. +let hasSideEffects = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in +def CHECK_MATCHING_VL_PSEUDO : Pseudo<(outs), (ins), []>, Sched<[]>; + +def AArch64_check_matching_vl + : SDNode<"AArch64ISD::CHECK_MATCHING_VL", SDTypeProfile<0, 0,[]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def : Pat<(AArch64_check_matching_vl), (CHECK_MATCHING_VL_PSEUDO)>; + //===----------------------------------------------------------------------===// // Old SME ABI lowering ISD nodes/pseudos (deprecated) //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll index 631d120f3c872..1f36fd13fbaa3 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll @@ -1,9 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 < %s -o - | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" declare void @bar_enabled() #0 +declare void @bar() +declare @bar_retv_enabled() #0 +declare @bar_retv() ; Non-streaming -> calls streaming callee define void @foo_non_streaming_pass_arg(ptr %arg) { @@ -107,24 +110,22 @@ define void @foo_streaming_compatible_pass_arg(ptr %arg) #1 { ; CHECK-NEXT: .cfi_offset b15, -1136 ; CHECK-NEXT: sub sp, sp, #1024 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: rdsvl x10, #1 -; CHECK-NEXT: cmp x9, x10 +; CHECK-NEXT: mrs x19, SVCR +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: b.eq .LBB1_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: brk #0x1 ; CHECK-NEXT: .LBB1_2: // %entry -; CHECK-NEXT: ldr z0, [x8] +; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: sub x8, x29, #1088 ; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: tbnz w0, #0, .LBB1_4 +; CHECK-NEXT: tbnz w19, #0, .LBB1_4 ; CHECK-NEXT: // %bb.3: // %entry ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB1_4: // %entry ; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: bl bar_enabled ; CHECK-NEXT: tbnz w19, #0, .LBB1_6 ; CHECK-NEXT: // %bb.5: // %entry @@ -163,7 +164,75 @@ entry: ret void } -declare @bar_retv_enabled() #0 +; Streaming -> calls non-streaming callee +define void @foo_streaming_pass_arg(ptr %arg) #0 { +; CHECK-LABEL: foo_streaming_pass_arg: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #1120 +; CHECK-NEXT: .cfi_def_cfa_offset 1120 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset w28, -8 +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_offset b8, -1064 +; CHECK-NEXT: .cfi_offset b9, -1072 +; CHECK-NEXT: .cfi_offset b10, -1080 +; CHECK-NEXT: .cfi_offset b11, -1088 +; CHECK-NEXT: .cfi_offset b12, -1096 +; CHECK-NEXT: .cfi_offset b13, -1104 +; CHECK-NEXT: .cfi_offset b14, -1112 +; CHECK-NEXT: .cfi_offset b15, -1120 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 2144 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: b.eq .LBB2_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB2_2: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: bl bar +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 1120 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1120 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = load , ptr %arg, align 16 + tail call void @bar( %v) + ret void +} ; Non-streaming -> returns SVE value from streaming callee define void @foo_non_streaming_retval(ptr %ptr) { @@ -197,10 +266,10 @@ define void @foo_non_streaming_retval(ptr %ptr) { ; CHECK-NEXT: rdvl x8, #1 ; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.eq .LBB2_2 +; CHECK-NEXT: b.eq .LBB3_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: brk #0x1 -; CHECK-NEXT: .LBB2_2: // %entry +; CHECK-NEXT: .LBB3_2: // %entry ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl bar_retv_enabled @@ -273,27 +342,26 @@ define void @foo_streaming_compatible_retval(ptr %ptr) #1 { ; CHECK-NEXT: .cfi_offset b15, -1136 ; CHECK-NEXT: sub sp, sp, #1024 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: mrs x20, SVCR ; CHECK-NEXT: rdvl x8, #1 ; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.eq .LBB3_2 +; CHECK-NEXT: b.eq .LBB4_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: brk #0x1 -; CHECK-NEXT: .LBB3_2: // %entry -; CHECK-NEXT: tbnz w0, #0, .LBB3_4 +; CHECK-NEXT: .LBB4_2: // %entry +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: tbnz w20, #0, .LBB4_4 ; CHECK-NEXT: // %bb.3: // %entry ; CHECK-NEXT: smstart sm -; CHECK-NEXT: .LBB3_4: // %entry -; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: .LBB4_4: // %entry ; CHECK-NEXT: bl bar_retv_enabled ; CHECK-NEXT: sub x8, x29, #1088 ; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: tbnz w20, #0, .LBB3_6 +; CHECK-NEXT: tbnz w20, #0, .LBB4_6 ; CHECK-NEXT: // %bb.5: // %entry ; CHECK-NEXT: smstop sm -; CHECK-NEXT: .LBB3_6: // %entry +; CHECK-NEXT: .LBB4_6: // %entry ; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -331,5 +399,86 @@ entry: ret void } +; Streaming -> returns SVE value from non-streaming callee +define void @foo_streaming_retval(ptr %ptr) #0 { +; CHECK-LABEL: foo_streaming_retval: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 1136 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #1120] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1088 +; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w28, -24 +; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: .cfi_offset w30, -40 +; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: .cfi_offset b8, -1080 +; CHECK-NEXT: .cfi_offset b9, -1088 +; CHECK-NEXT: .cfi_offset b10, -1096 +; CHECK-NEXT: .cfi_offset b11, -1104 +; CHECK-NEXT: .cfi_offset b12, -1112 +; CHECK-NEXT: .cfi_offset b13, -1120 +; CHECK-NEXT: .cfi_offset b14, -1128 +; CHECK-NEXT: .cfi_offset b15, -1136 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: b.eq .LBB5_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB5_2: // %entry +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: bl bar_retv +; CHECK-NEXT: sub x8, x29, #1088 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa wsp, 1136 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #1120] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = tail call @bar_retv() + store %v, ptr %ptr, align 16 + ret void +} + attributes #0 = { "aarch64_pstate_sm_enabled" } attributes #1 = { "aarch64_pstate_sm_compatible" } diff --git a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll index 9e711fe05f462..33a4ecd56e35b 100644 --- a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll +++ b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll @@ -2,7 +2,7 @@ ; We should have both spill and reload for %arg. -; CHECK: remark: :0:0: 2 spills 1.500000e+00 total spills cost 3 reloads 1.500000e+00 total reloads cost 1 virtual registers copies 1.000000e+00 total copies cost generated in function +; CHECK: remark: :0:0: 2 spills 1.500000e+00 total spills cost 3 reloads 1.500000e+00 total reloads cost generated in function define @streaming_compatible_with_predicate_vectors( %arg) "aarch64_pstate_sm_compatible" nounwind #0 { %res = call @normal_callee_predicate_vec_arg( %arg) %and = and %res, %arg From f3f31c9ed12d892e815b5df5253aae26e3b0cf07 Mon Sep 17 00:00:00 2001 From: Mary Kassayova Date: Thu, 11 Sep 2025 16:08:57 +0000 Subject: [PATCH 3/4] Added MIR test & some comments --- .../Target/AArch64/AArch64ISelLowering.cpp | 4 + .../AArch64/sme-streaming-checkvl-mir.ll | 209 ++++++++++++++++++ .../CodeGen/AArch64/sme-streaming-checkvl.ll | 2 +- 3 files changed, 214 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 19ae0e848a9e6..54349ae5ceb67 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2956,9 +2956,11 @@ AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI, Register RegSVL = MRI.createVirtualRegister(RC); Register RegCheck = MRI.createVirtualRegister(RC); + // Read VL and Streaming VL BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL).addImm(1); BuildMI(*MBB, MI, DL, TII->get(AArch64::RDSVLI_XI), RegSVL).addImm(1); + // Compare vector lengths BuildMI(*MBB, MI, DL, TII->get(AArch64::SUBXrr), RegCheck) .addReg(RegVL) .addReg(RegSVL); @@ -2968,6 +2970,7 @@ AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI, MF->insert(It, TrapBB); MF->insert(It, PassBB); + // Continue if vector lengths match BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX)) .addReg(RegCheck) .addMBB(PassBB); @@ -2977,6 +2980,7 @@ AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); PassBB->transferSuccessorsAndUpdatePHIs(MBB); + // Trap if vector lengths mismatch BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1); MBB->addSuccessor(TrapBB); diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll new file mode 100644 index 0000000000000..8ea3e1f71c7ad --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 -stop-before=finalize-isel -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-BEFORE-ISEL +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-AFTER-ISEL + +target triple = "aarch64-unknown-linux-gnu" + +declare void @bar_enabled() #0 +declare void @bar() +declare @bar_retv_enabled() #0 +declare @bar_retv() + +; Non-streaming -> calls streaming callee +define void @foo_non_streaming_pass_arg(ptr %arg) { + ; CHECK-BEFORE-ISEL-LABEL: name: foo_non_streaming_pass_arg + ; CHECK-BEFORE-ISEL: bb.0.entry: + ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0 + ; CHECK-BEFORE-ISEL-NEXT: {{ $}} + ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-BEFORE-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load () from %ir.arg) + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]] + ; CHECK-BEFORE-ISEL-NEXT: BL @bar_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR + ; + ; CHECK-AFTER-ISEL-LABEL: name: foo_non_streaming_pass_arg + ; CHECK-AFTER-ISEL: bb.0.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-AFTER-ISEL-NEXT: liveins: $x0 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-AFTER-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load () from %ir.arg) + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: BRK 1 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.2.entry: + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]] + ; CHECK-AFTER-ISEL-NEXT: BL @bar_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR +entry: + %v = load , ptr %arg, align 16 + tail call void @bar_enabled( %v) #0 + ret void +} + +; Streaming -> calls non-streaming callee +define void @foo_streaming_pass_arg(ptr %arg) #0 { + ; CHECK-BEFORE-ISEL-LABEL: name: foo_streaming_pass_arg + ; CHECK-BEFORE-ISEL: bb.0.entry: + ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0 + ; CHECK-BEFORE-ISEL-NEXT: {{ $}} + ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-BEFORE-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load () from %ir.arg) + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO + ; CHECK-BEFORE-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]] + ; CHECK-BEFORE-ISEL-NEXT: BL @bar, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR + ; + ; CHECK-AFTER-ISEL-LABEL: name: foo_streaming_pass_arg + ; CHECK-AFTER-ISEL: bb.0.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-AFTER-ISEL-NEXT: liveins: $x0 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-AFTER-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load () from %ir.arg) + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: BRK 1 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.2.entry: + ; CHECK-AFTER-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]] + ; CHECK-AFTER-ISEL-NEXT: BL @bar, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR +entry: + %v = load , ptr %arg, align 16 + tail call void @bar( %v) + ret void +} + +; Non-streaming -> returns SVE value from streaming callee +define void @foo_non_streaming_retval(ptr %ptr) { + ; CHECK-BEFORE-ISEL-LABEL: name: foo_non_streaming_retval + ; CHECK-BEFORE-ISEL: bb.0.entry: + ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0 + ; CHECK-BEFORE-ISEL-NEXT: {{ $}} + ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: BL @bar_retv_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]] + ; CHECK-BEFORE-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store () into %ir.ptr) + ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR + ; + ; CHECK-AFTER-ISEL-LABEL: name: foo_non_streaming_retval + ; CHECK-AFTER-ISEL: bb.0.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-AFTER-ISEL-NEXT: liveins: $x0 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: BRK 1 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.2.entry: + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: BL @bar_retv_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]] + ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store () into %ir.ptr) + ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR +entry: + %v = tail call @bar_retv_enabled() #0 + store %v, ptr %ptr, align 16 + ret void +} + +; Streaming -> returns SVE value from non-streaming callee +define void @foo_streaming_retval(ptr %ptr) #0 { + ; CHECK-BEFORE-ISEL-LABEL: name: foo_streaming_retval + ; CHECK-BEFORE-ISEL: bb.0.entry: + ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0 + ; CHECK-BEFORE-ISEL-NEXT: {{ $}} + ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO + ; CHECK-BEFORE-ISEL-NEXT: BL @bar_retv, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]] + ; CHECK-BEFORE-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store () into %ir.ptr) + ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR + ; + ; CHECK-AFTER-ISEL-LABEL: name: foo_streaming_retval + ; CHECK-AFTER-ISEL: bb.0.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-AFTER-ISEL-NEXT: liveins: $x0 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: BRK 1 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.2.entry: + ; CHECK-AFTER-ISEL-NEXT: BL @bar_retv, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]] + ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store () into %ir.ptr) + ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR +entry: + %v = tail call @bar_retv() + store %v, ptr %ptr, align 16 + ret void +} + +attributes #0 = { "aarch64_pstate_sm_enabled" } +attributes #1 = { "aarch64_pstate_sm_compatible" } diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll index 1f36fd13fbaa3..8c197ef97b116 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 -verify-machineinstrs < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" From 3a2c02a6a71737eae4c286ff417404b99d04c31d Mon Sep 17 00:00:00 2001 From: Mary Kassayova Date: Mon, 15 Sep 2025 14:05:39 +0000 Subject: [PATCH 4/4] Replaced RDSVL+SUB with ADDSVL, simplified changeStreamingMode --- .../Target/AArch64/AArch64ISelLowering.cpp | 93 ++++++++++--------- .../AArch64/sme-callee-save-restore-pairs.ll | 10 +- .../test/CodeGen/AArch64/sme-peephole-opts.ll | 10 +- .../AArch64/sme-streaming-checkvl-mir.ll | 40 ++++---- .../CodeGen/AArch64/sme-streaming-checkvl.ll | 34 +++---- .../sme-streaming-compatible-interface.ll | 10 +- llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 10 +- 7 files changed, 99 insertions(+), 108 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 54349ae5ceb67..df50375ea3763 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2944,27 +2944,34 @@ MachineBasicBlock * AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI, MachineBasicBlock *MBB) const { MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - DebugLoc DL = MI.getDebugLoc(); - MachineFunction::iterator It = ++MBB->getIterator(); - - const TargetRegisterClass *RC = &AArch64::GPR64RegClass; MachineRegisterInfo &MRI = MF->getRegInfo(); - Register RegVL = MRI.createVirtualRegister(RC); - Register RegSVL = MRI.createVirtualRegister(RC); - Register RegCheck = MRI.createVirtualRegister(RC); + const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass; + const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass; - // Read VL and Streaming VL - BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL).addImm(1); - BuildMI(*MBB, MI, DL, TII->get(AArch64::RDSVLI_XI), RegSVL).addImm(1); + Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR); + Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src + Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR); + Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst - // Compare vector lengths - BuildMI(*MBB, MI, DL, TII->get(AArch64::SUBXrr), RegCheck) - .addReg(RegVL) - .addReg(RegSVL); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + + // RDVL requires GPR64, ADDSVL requires GPR64sp + // We need to insert COPY instructions, these will later be removed by the + // RegisterCoalescer + BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1); + BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp) + .addReg(RegVL_GPR); + BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp) + .addReg(RegVL_GPRsp) + .addImm(-1); + BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR) + .addReg(RegSVL_GPRsp); + + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction::iterator It = ++MBB->getIterator(); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, TrapBB); @@ -2972,7 +2979,7 @@ AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI, // Continue if vector lengths match BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX)) - .addReg(RegCheck) + .addReg(RegSVL_GPR) .addMBB(PassBB); // Transfer rest of current BB to PassBB @@ -9173,6 +9180,22 @@ SDValue AArch64TargetLowering::changeStreamingMode( AArch64FunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setHasStreamingModeChanges(true); + auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue { + SmallVector Ops = {Chain}; + if (InGlue) + Ops.push_back(InGlue); + return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL, + DAG.getVTList(MVT::Other, MVT::Glue), Ops); + }; + + if (InsertVectorLengthCheck && Enable) { + // Non-streaming -> Streaming + // Insert vector length check before smstart + SDValue CheckVL = GetCheckVL(Chain, InGlue); + Chain = CheckVL.getValue(0); + InGlue = CheckVL.getValue(1); + } + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()); SDValue MSROp = @@ -9199,38 +9222,16 @@ SDValue AArch64TargetLowering::changeStreamingMode( if (InGlue) Ops.push_back(InGlue); - if (!InsertVectorLengthCheck) - return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); - - auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue { - SmallVector Ops = {Chain}; - if (InGlue) - Ops.push_back(InGlue); - return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL, - DAG.getVTList(MVT::Other, MVT::Glue), Ops); - }; - - // Non-streaming -> Streaming - if (Enable) { - SDValue CheckVL = GetCheckVL(Chain, InGlue); - - // Replace chain - Ops[0] = CheckVL.getValue(0); - - // Replace/append glue - if (InGlue) - Ops.back() = CheckVL.getValue(1); - else - Ops.push_back(CheckVL.getValue(1)); + SDValue SMChange = + DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); - return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); - } + if (!InsertVectorLengthCheck || Enable) + return SMChange; // Streaming -> Non-streaming - SDValue StreamingModeInstr = - DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); - return GetCheckVL(StreamingModeInstr.getValue(0), - StreamingModeInstr.getValue(1)); + // Insert vector length check after smstop since we cannot read VL + // in streaming mode + return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1)); } // Emit a call to __arm_sme_save or __arm_sme_restore. diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll index af6ebb3846738..b58a857f3a3cb 100644 --- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll +++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll @@ -48,9 +48,8 @@ define void @fbyte( %v) #0{ ; NOPAIR-NEXT: smstop sm ; NOPAIR-NEXT: .LBB0_2: ; NOPAIR-NEXT: rdvl x8, #1 -; NOPAIR-NEXT: rdsvl x9, #1 -; NOPAIR-NEXT: cmp x8, x9 -; NOPAIR-NEXT: b.eq .LBB0_4 +; NOPAIR-NEXT: addsvl x8, x8, #-1 +; NOPAIR-NEXT: cbz x8, .LBB0_4 ; NOPAIR-NEXT: // %bb.3: ; NOPAIR-NEXT: brk #0x1 ; NOPAIR-NEXT: .LBB0_4: @@ -135,9 +134,8 @@ define void @fbyte( %v) #0{ ; PAIR-NEXT: smstop sm ; PAIR-NEXT: .LBB0_2: ; PAIR-NEXT: rdvl x8, #1 -; PAIR-NEXT: rdsvl x9, #1 -; PAIR-NEXT: cmp x8, x9 -; PAIR-NEXT: b.eq .LBB0_4 +; PAIR-NEXT: addsvl x8, x8, #-1 +; PAIR-NEXT: cbz x8, .LBB0_4 ; PAIR-NEXT: // %bb.3: ; PAIR-NEXT: brk #0x1 ; PAIR-NEXT: .LBB0_4: diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll index 9ff414d401426..1659b217ce0be 100644 --- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -530,9 +530,8 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" { ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB14_2 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbnz x8, .LBB14_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov x19, x0 @@ -541,9 +540,8 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" { ; CHECK-NEXT: smstart sm ; CHECK-NEXT: smstop sm ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.eq .LBB14_3 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB14_3 ; CHECK-NEXT: .LBB14_2: ; CHECK-NEXT: brk #0x1 ; CHECK-NEXT: .LBB14_3: diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll index 8ea3e1f71c7ad..0ac46085d683f 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll @@ -35,9 +35,10 @@ define void @foo_non_streaming_pass_arg(ptr %arg) { ; CHECK-AFTER-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load () from %ir.arg) ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg - ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg - ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]] - ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2 ; CHECK-AFTER-ISEL-NEXT: {{ $}} ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: ; CHECK-AFTER-ISEL-NEXT: successors: @@ -84,9 +85,10 @@ define void @foo_streaming_pass_arg(ptr %arg) #0 { ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg - ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg - ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]] - ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2 ; CHECK-AFTER-ISEL-NEXT: {{ $}} ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: ; CHECK-AFTER-ISEL-NEXT: successors: @@ -131,9 +133,10 @@ define void @foo_non_streaming_retval(ptr %ptr) { ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg - ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg - ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]] - ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2 ; CHECK-AFTER-ISEL-NEXT: {{ $}} ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: ; CHECK-AFTER-ISEL-NEXT: successors: @@ -144,10 +147,10 @@ define void @foo_non_streaming_retval(ptr %ptr) { ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-AFTER-ISEL-NEXT: BL @bar_retv_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp - ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-AFTER-ISEL-NEXT: [[COPY3:%[0-9]+]]:zpr = COPY $z0 ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr - ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]] - ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store () into %ir.ptr) + ; CHECK-AFTER-ISEL-NEXT: [[COPY4:%[0-9]+]]:zpr = COPY [[COPY3]] + ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY4]], [[COPY]], 0 :: (store () into %ir.ptr) ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR entry: %v = tail call @bar_retv_enabled() #0 @@ -182,9 +185,10 @@ define void @foo_streaming_retval(ptr %ptr) #0 { ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg - ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg - ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]] - ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2 ; CHECK-AFTER-ISEL-NEXT: {{ $}} ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: ; CHECK-AFTER-ISEL-NEXT: successors: @@ -194,10 +198,10 @@ define void @foo_streaming_retval(ptr %ptr) #0 { ; CHECK-AFTER-ISEL-NEXT: bb.2.entry: ; CHECK-AFTER-ISEL-NEXT: BL @bar_retv, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp - ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-AFTER-ISEL-NEXT: [[COPY3:%[0-9]+]]:zpr = COPY $z0 ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr - ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]] - ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store () into %ir.ptr) + ; CHECK-AFTER-ISEL-NEXT: [[COPY4:%[0-9]+]]:zpr = COPY [[COPY3]] + ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY4]], [[COPY]], 0 :: (store () into %ir.ptr) ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR entry: %v = tail call @bar_retv() diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll index 8c197ef97b116..a1eb1ceeaf19b 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll @@ -36,9 +36,8 @@ define void @foo_non_streaming_pass_arg(ptr %arg) { ; CHECK-NEXT: .cfi_offset b15, -96 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.eq .LBB0_2 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB0_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: brk #0x1 ; CHECK-NEXT: .LBB0_2: // %entry @@ -110,11 +109,10 @@ define void @foo_streaming_compatible_pass_arg(ptr %arg) #1 { ; CHECK-NEXT: .cfi_offset b15, -1136 ; CHECK-NEXT: sub sp, sp, #1024 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.eq .LBB1_2 +; CHECK-NEXT: mrs x19, SVCR +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB1_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: brk #0x1 ; CHECK-NEXT: .LBB1_2: // %entry @@ -195,9 +193,8 @@ define void @foo_streaming_pass_arg(ptr %arg) #0 { ; CHECK-NEXT: .cfi_def_cfa_offset 2144 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.eq .LBB2_2 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB2_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: brk #0x1 ; CHECK-NEXT: .LBB2_2: // %entry @@ -264,9 +261,8 @@ define void @foo_non_streaming_retval(ptr %ptr) { ; CHECK-NEXT: .cfi_offset b15, -112 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.eq .LBB3_2 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB3_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: brk #0x1 ; CHECK-NEXT: .LBB3_2: // %entry @@ -342,11 +338,10 @@ define void @foo_streaming_compatible_retval(ptr %ptr) #1 { ; CHECK-NEXT: .cfi_offset b15, -1136 ; CHECK-NEXT: sub sp, sp, #1024 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: mrs x20, SVCR ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.eq .LBB4_2 +; CHECK-NEXT: mrs x20, SVCR +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB4_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: brk #0x1 ; CHECK-NEXT: .LBB4_2: // %entry @@ -434,9 +429,8 @@ define void @foo_streaming_retval(ptr %ptr) #0 { ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.eq .LBB5_2 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB5_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: brk #0x1 ; CHECK-NEXT: .LBB5_2: // %entry diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 95fb68945de44..f2163ad15bafc 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -210,9 +210,8 @@ define @streaming_compatible_with_scalable_vectors( @streaming_compatible_with_predicate_vectors( %x) #0 { ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: smstop sm ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.eq .LBB3_2 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB3_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: brk #0x1 ; CHECK-NEXT: .LBB3_2: @@ -480,9 +479,8 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: //NO_APP ; FP-CHECK-NEXT: smstop sm ; FP-CHECK-NEXT: rdvl x8, #1 -; FP-CHECK-NEXT: rdsvl x9, #1 -; FP-CHECK-NEXT: cmp x8, x9 -; FP-CHECK-NEXT: b.eq .LBB3_2 +; FP-CHECK-NEXT: addsvl x8, x8, #-1 +; FP-CHECK-NEXT: cbz x8, .LBB3_2 ; FP-CHECK-NEXT: // %bb.1: ; FP-CHECK-NEXT: brk #0x1 ; FP-CHECK-NEXT: .LBB3_2: