diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index f552f91929201..fdef6b752f6b3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2435,6 +2435,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((AArch64ISD::NodeType)Opcode) { case AArch64ISD::FIRST_NUMBER: break; + MAKE_CASE(AArch64ISD::ALLOCATE_ZA_BUFFER) + MAKE_CASE(AArch64ISD::INIT_TPIDR2OBJ) MAKE_CASE(AArch64ISD::COALESCER_BARRIER) MAKE_CASE(AArch64ISD::SMSTART) MAKE_CASE(AArch64ISD::SMSTOP) @@ -2929,6 +2931,80 @@ AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const { return BB; } +MachineBasicBlock * +AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); + if (TPIDR2.Uses > 0) { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + // Store the buffer pointer to the TPIDR2 stack object. 
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) + .addReg(MI.getOperand(0).getReg()) + .addFrameIndex(TPIDR2.FrameIndex) + .addImm(0); + // Set the reserved bytes (10-15) to zero + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui)) + .addReg(AArch64::WZR) + .addFrameIndex(TPIDR2.FrameIndex) + .addImm(5); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui)) + .addReg(AArch64::WZR) + .addFrameIndex(TPIDR2.FrameIndex) + .addImm(3); + } else + MFI.RemoveStackObject(TPIDR2.FrameIndex); + + BB->remove_instr(&MI); + return BB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); + // TODO This function grows the stack with a subtraction, which doesn't work + // on Windows. Some refactoring to share the functionality in + // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI + // supports SME + assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() && + "Lazy ZA save is not yet supported on Windows"); + + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); + + if (TPIDR2.Uses > 0) { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // The SUBXrs below won't always be emitted in a form that accepts SP + // directly + Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP) + .addReg(AArch64::SP); + + // Allocate a lazy-save buffer object of the size given, normally SVL * SVL + auto Size = MI.getOperand(1).getReg(); + auto Dest = MI.getOperand(0).getReg(); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest) + .addReg(Size) + .addReg(Size) + .addReg(SP); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), + AArch64::SP) + .addReg(Dest); + + // We have just 
allocated a variable sized object, tell this to PEI. + MFI.CreateVariableSizedObject(Align(16), nullptr); + } + + BB->remove_instr(&MI); + return BB; +} + MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { @@ -2959,7 +3035,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MI.dump(); #endif llvm_unreachable("Unexpected instruction for custom inserter!"); - + case AArch64::InitTPIDR2Obj: + return EmitInitTPIDR2Object(MI, BB); + case AArch64::ExpandZABuffer: + return EmitExpandZABuffer(MI, BB); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); case TargetOpcode::STATEPOINT: @@ -6854,47 +6933,6 @@ AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { } } - -unsigned -AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL, - SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo &MFI = MF.getFrameInfo(); - - // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case) - SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N); - SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)}; - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other); - SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops); - Chain = Buffer.getValue(1); - MFI.CreateVariableSizedObject(Align(1), nullptr); - - // Allocate an additional TPIDR2 object on the stack (16 bytes) - unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false); - - // Store the buffer pointer to the TPIDR2 stack object. 
- MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); - SDValue Ptr = DAG.getFrameIndex( - TPIDR2Obj, - DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI); - - // Set the reserved bytes (10-15) to zero - EVT PtrTy = Ptr.getValueType(); - SDValue ReservedPtr = - DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy)); - Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr, - MPI); - ReservedPtr = - DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy)); - Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr, - MPI); - - return TPIDR2Obj; -} - static bool isPassedInFPR(EVT VT) { return VT.isFixedLengthVector() || (VT.isFloatingPoint() && !VT.isScalableVector()); @@ -7311,10 +7349,28 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Subtarget->hasCustomCallingConv()) Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); - // Conservatively assume the function requires the lazy-save mechanism. + // Create a 16 Byte TPIDR2 object. The dynamic buffer + // will be expanded and stored in the static object later using a pseudonode. 
if (SMEAttrs(MF.getFunction()).hasZAState()) { - unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG); - FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj); + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); + TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false); + SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); + + SDValue Buffer; + if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { + Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL, + DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL}); + } else { + SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); + Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, + DAG.getVTList(MVT::i64, MVT::Other), + {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); + MFI.CreateVariableSizedObject(Align(16), nullptr); + } + Chain = DAG.getNode( + AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other), + {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)}); } return Chain; @@ -7985,9 +8041,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); if (RequiresLazySave) { - unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); - MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); - SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj, + const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); + MachinePointerInfo MPI = + MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex); + SDValue TPIDR2ObjAddr = DAG.getFrameIndex( + TPIDR2.FrameIndex, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); SDValue NumZaSaveSlicesAddr = DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, @@ -8502,7 +8560,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (RequiresLazySave) { // Conditionally restore the lazy save using a pseudo node. 
- unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); SDValue RegMask = DAG.getRegisterMask( TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); SDValue RestoreRoutine = DAG.getTargetExternalSymbol( @@ -8515,7 +8573,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // RESTORE_ZA pseudo. SDValue Glue; SDValue TPIDR2Block = DAG.getFrameIndex( - FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + TPIDR2.FrameIndex, + DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, @@ -8527,6 +8586,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, ISD::INTRINSIC_VOID, DL, MVT::Other, Result, DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64)); + TPIDR2.Uses++; } if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 3465f3be88754..50082bed61ec7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -444,6 +444,8 @@ enum NodeType : unsigned { // SME RDSVL, REVD_MERGE_PASSTHRU, + ALLOCATE_ZA_BUFFER, + INIT_TPIDR2OBJ, // Asserts that a function argument (i32) is zero-extended to i8 by // the caller @@ -642,6 +644,10 @@ class AArch64TargetLowering : public TargetLowering { MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const; MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitInitTPIDR2Object(MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitExpandZABuffer(MachineInstr &MI, + MachineBasicBlock *BB) const; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, @@ -1010,9 +1016,6 @@ 
class AArch64TargetLowering : public TargetLowering { void addDRTypeForNEON(MVT VT); void addQRTypeForNEON(MVT VT); - unsigned allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL, - SelectionDAG &DAG) const; - SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index d5941e6284111..0f33b237b3de7 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -35,6 +35,11 @@ struct AArch64FunctionInfo; class AArch64Subtarget; class MachineInstr; +struct TPIDR2Object { + int FrameIndex = std::numeric_limits<int>::max(); + unsigned Uses = 0; +}; + /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and /// contains private AArch64-specific information for each MachineFunction. class AArch64FunctionInfo final : public MachineFunctionInfo { @@ -195,7 +200,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { bool IsSVECC = false; /// The frame-index for the TPIDR2 object used for lazy saves. - Register LazySaveTPIDR2Obj = 0; + TPIDR2Object TPIDR2; /// Whether this function changes streaming mode within the function. 
bool HasStreamingModeChanges = false; @@ -226,8 +231,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { bool isSVECC() const { return IsSVECC; }; void setIsSVECC(bool s) { IsSVECC = s; }; - unsigned getLazySaveTPIDR2Obj() const { return LazySaveTPIDR2Obj; } - void setLazySaveTPIDR2Obj(unsigned Reg) { LazySaveTPIDR2Obj = Reg; } + TPIDR2Object &getTPIDR2Obj() { return TPIDR2; } void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI); diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 2db0fa2534345..8eccf74cc90e6 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -31,6 +31,22 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2, def AArch64CoalescerBarrier : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>; + +def AArch64ExpandZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1, + [SDTCisInt<0>, SDTCisInt<1>]>, + [SDNPHasChain, SDNPSideEffect]>; +let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in { + def ExpandZABuffer : Pseudo<(outs GPR64sp:$dst), (ins GPR64:$size), []>, Sched<[WriteI]> {} +} +def : Pat<(i64 (AArch64ExpandZABuffer GPR64:$size)), + (ExpandZABuffer $size)>; + +def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 1, + [SDTCisInt<0>]>, [SDNPHasChain, SDNPMayStore]>; +let usesCustomInserter = 1 in { + def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer), [(AArch64InitTPIDR2Obj GPR64:$buffer)]>, Sched<[WriteI]> {} +} + //===----------------------------------------------------------------------===// // Instruction naming conventions. 
//===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index cd348be5d771d..4e2711d396d5e 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -214,16 +214,8 @@ declare double @za_shared_callee(double) "aarch64_inout_za" define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{ ; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee: ; CHECK-COMMON: // %bb.0: // %prelude -; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-COMMON-NEXT: mov x29, sp -; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 -; CHECK-COMMON-NEXT: mov sp, x8 -; CHECK-COMMON-NEXT: stur x8, [x29, #-16] -; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-COMMON-NEXT: cbz x8, .LBB6_2 ; CHECK-COMMON-NEXT: b .LBB6_1 @@ -239,8 +231,7 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o ; CHECK-COMMON-NEXT: fmov d1, x8 ; CHECK-COMMON-NEXT: fadd d0, d0, d1 ; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: mov sp, x29 -; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-COMMON-NEXT: ret entry: %call = call double @za_shared_callee(double %x) @@ -298,12 +289,12 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mov x9, sp ; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 -; CHECK-COMMON-NEXT: sub x10, x29, #16 -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] -; CHECK-COMMON-NEXT: sturh 
wzr, [x29, #-6] ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] +; CHECK-COMMON-NEXT: sub x9, x29, #16 +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 @@ -360,12 +351,12 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mov x9, sp ; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 -; CHECK-COMMON-NEXT: sub x10, x29, #16 -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] -; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] +; CHECK-COMMON-NEXT: sub x9, x29, #16 +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 ; CHECK-COMMON-NEXT: bl fmod ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index 9d635f0b88f19..587d60d7f1f73 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -15,12 +15,12 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: 
mrs x8, TPIDR2_EL0 @@ -50,9 +50,9 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: msub x8, x19, x19, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x20, x29, #16 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w19, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee @@ -95,12 +95,12 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -133,12 +133,12 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #80 -; CHECK-NEXT: stur wzr, [x29, #-68] -; CHECK-NEXT: sturh wzr, [x29, #-70] ; CHECK-NEXT: stur x9, [x29, #-80] +; CHECK-NEXT: sub x9, x29, #80 +; CHECK-NEXT: sturh wzr, [x29, #-70] +; CHECK-NEXT: stur wzr, [x29, #-68] ; CHECK-NEXT: sturh w8, [x29, #-72] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbz w19, #0, .LBB3_2 diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index cd7460b177c4b..393ff3b79aedf 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ 
b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -14,12 +14,12 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -47,12 +47,12 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll new file mode 100644 index 0000000000000..ad3f7f5514d0e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +define i32 @no_tpidr2_save_required() "aarch64_inout_za" { +; CHECK-LABEL: no_tpidr2_save_required: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, #42 // =0x2a +; CHECK-NEXT: ret +entry: + ret i32 42 +} + +define float @multi_bb_stpidr2_save_required(i32 %a, float 
%b, float %c) "aarch64_inout_za" { +; CHECK-LABEL: multi_bb_stpidr2_save_required: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: cbz w0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %use_b +; CHECK-NEXT: fmov s1, #4.00000000 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: b .LBB1_5 +; CHECK-NEXT: .LBB1_2: // %use_c +; CHECK-NEXT: fmov s0, s1 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: bl cosf +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB1_4 +; CHECK-NEXT: // %bb.3: // %use_c +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB1_4: // %use_c +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB1_5: // %exit +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %cmp = icmp ne i32 %a, 0 + br i1 %cmp, label %use_b, label %use_c + +use_b: + %faddr = fadd float %b, 4.0 + br label %exit + +use_c: + %res2 = call float @llvm.cos.f32(float %c) + br label %exit + +exit: + %ret = phi float [%faddr, %use_b], [%res2, %use_c] + ret float %ret +} + +define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" { +; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: str xzr, [sp, #-16]! 
+; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB2_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB2_1 +; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: ldr xzr, [sp] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: cbz w0, .LBB2_5 +; CHECK-NEXT: // %bb.4: // %use_b +; CHECK-NEXT: fmov s1, #4.00000000 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: b .LBB2_8 +; CHECK-NEXT: .LBB2_5: // %use_c +; CHECK-NEXT: fmov s0, s1 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: bl cosf +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB2_7 +; CHECK-NEXT: // %bb.6: // %use_c +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB2_7: // %use_c +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB2_8: // %exit +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %cmp = icmp ne i32 %a, 0 + br i1 %cmp, label %use_b, label %use_c + +use_b: + %faddr = fadd float %b, 4.0 + br label %exit + +use_c: + %res2 = call float @llvm.cos.f32(float %c) + br label %exit + +exit: + %ret = phi float [%faddr, %use_b], [%res2, %use_c] + ret float %ret +} + +declare float @llvm.cos.f32(float) diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 7f40b5e7e1344..312537630e77a 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ 
b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -41,13 +41,13 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_ ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sub x9, x29, #16 ; CHECK-NEXT: sub x19, x29, #80 -; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart za @@ -87,24 +87,14 @@ define void @zt0_shared_caller_zt0_shared_callee() "aarch64_in_zt0" nounwind { define void @za_zt0_shared_caller_za_shared_callee() "aarch64_inout_za" "aarch64_in_zt0" nounwind { ; CHECK-LABEL: za_zt0_shared_caller_za_shared_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: sub x19, x29, #80 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: bl callee ; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret call void @callee() "aarch64_inout_za"; ret void; @@ -114,19 +104,9 @@ define void @za_zt0_shared_caller_za_shared_callee() "aarch64_inout_za" "aarch64 define void @za_zt0_shared_caller_za_zt0_shared_callee() "aarch64_inout_za" "aarch64_in_zt0" nounwind { ; CHECK-LABEL: za_zt0_shared_caller_za_zt0_shared_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: bl callee -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret call void @callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; @@ -192,20 +172,12 @@ define void @zt0_new_caller() "aarch64_new_zt0" nounwind { define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { ; CHECK-LABEL: new_za_zt0_caller: ; CHECK: // %bb.0: // %prelude -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: cbz x8, .LBB7_2 ; CHECK-NEXT: // %bb.1: // %save.za -; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: str zt0, [x8] ; CHECK-NEXT: bl __arm_tpidr2_save ; CHECK-NEXT: ldr zt0, [x8] @@ -216,8 +188,8 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { ; CHECK-NEXT: zero { zt0 } ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstop za -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret call void @callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; @@ -227,20 +199,10 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwind { ; CHECK-LABEL: new_za_shared_zt0_caller: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: zero {za} ; CHECK-NEXT: bl callee -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret call void @callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; @@ -250,20 +212,10 @@ define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwi define void @shared_za_new_zt0() "aarch64_inout_za" "aarch64_new_zt0" nounwind { ; CHECK-LABEL: shared_za_new_zt0: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: zero { zt0 } ; CHECK-NEXT: bl callee -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret call void @callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void;