From 244e62b960ff49f3ba3e2d44234943183481006c Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Mon, 20 Oct 2025 15:46:00 +0200 Subject: [PATCH 1/3] [AMDGPU] Move getScratchScaleFactor to ST. NFC --- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 ++++ llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 16 ++++++---------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index cb27f474d78f3..f9f8c196aeb33 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1057,6 +1057,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // dynamic realignment in common cases. Align getStackAlignment() const { return Align(16); } + unsigned getScratchScaleFactor() const { + return enableFlatScratch() ? 1 : getWavefrontSize(); + } + bool enableMachineScheduler() const override { return true; } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index ffbb111d42221..040553d6ea7da 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -590,10 +590,6 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( return ScratchRsrcReg; } -static unsigned getScratchScaleFactor(const GCNSubtarget &ST) { - return ST.enableFlatScratch() ? 
1 : ST.getWavefrontSize(); -} - void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); @@ -693,7 +689,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg); - unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST); + unsigned Offset = FrameInfo.getStackSize() * ST.getScratchScaleFactor(); if (!mayReserveScratchForCWSR(MF)) { if (hasFP(MF)) { Register FPReg = MFI->getFrameOffsetReg(); @@ -1231,7 +1227,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, assert(StackPtrReg != AMDGPU::SP_REG); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg) - .addImm(MFI.getStackSize() * getScratchScaleFactor(ST)); + .addImm(MFI.getStackSize() * ST.getScratchScaleFactor()); } } @@ -1292,11 +1288,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, // s_and_b32 s33, s33, 0b111...0000 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg) .addReg(StackPtrReg) - .addImm((Alignment - 1) * getScratchScaleFactor(ST)) + .addImm((Alignment - 1) * ST.getScratchScaleFactor()) .setMIFlag(MachineInstr::FrameSetup); auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) .addReg(FramePtrReg, RegState::Kill) - .addImm(-Alignment * getScratchScaleFactor(ST)) + .addImm(-Alignment * ST.getScratchScaleFactor()) .setMIFlag(MachineInstr::FrameSetup); And->getOperand(3).setIsDead(); // Mark SCC as dead. 
FuncInfo->setIsStackRealigned(true); @@ -1327,7 +1323,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, if (HasFP && RoundedSize != 0) { auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) .addReg(StackPtrReg) - .addImm(RoundedSize * getScratchScaleFactor(ST)) + .addImm(RoundedSize * ST.getScratchScaleFactor()) .setMIFlag(MachineInstr::FrameSetup); Add->getOperand(3).setIsDead(); // Mark SCC as dead. } @@ -2137,7 +2133,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( const SIMachineFunctionInfo *MFI = MF.getInfo(); Register SPReg = MFI->getStackPtrOffsetReg(); - Amount *= getScratchScaleFactor(ST); + Amount *= ST.getScratchScaleFactor(); if (IsDestroy) Amount = -Amount; auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg) From 7361d2ae91aedfb6c0950484572c86db0e84a013 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Fri, 3 Oct 2025 13:51:55 +0200 Subject: [PATCH 2/3] [AMDGPU] Update machine frame info during inlining Update some of the machine frame info while inlining functions. The stack of the caller will now contain an additional object representing the stacks of its callees that have been inlined. Also update some other info such as HasCalls and a few other pieces of info that are trivial to update (this isn't very thorough or exhaustive, and notably doesn't handle tail calls). 
--- .../AMDGPU/AMDGPUMachineLevelInliner.cpp | 71 ++ .../Target/AMDGPU/AMDGPUMachineLevelInliner.h | 16 + .../amdgpu-machine-level-inliner-mfi.mir | 651 ++++++++++++++++++ .../AMDGPU/amdgpu-machine-level-inliner.ll | 123 ++++ .../AMDGPU/amdgpu-machine-level-inliner.mir | 99 +++ .../AMDGPU/pal-metadata-3.6-inliner.ll | 199 ++++++ 6 files changed, 1159 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner-mfi.mir create mode 100644 llvm/test/CodeGen/AMDGPU/pal-metadata-3.6-inliner.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.cpp index 8a586ddbfdfa5..18960bab86cda 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.cpp @@ -10,6 +10,7 @@ #include "AMDGPU.h" #include "AMDGPUMachineModuleInfo.h" #include "AMDGPUSubtarget.h" +#include "GCNSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/ADT/DenseMap.h" @@ -128,15 +129,20 @@ bool AMDGPUMachineLevelInliner::runOnMachineFunction(MachineFunction &MF) { if (!MFI.hasCalls() && !MFI.hasTailCall()) return false; + MaxInlinedCalleeStackSize = 0; + HasInlinedVarSizedStack = false; + // Collect calls to inline. SmallVector CallsToInline; const SIInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + size_t CallsFound = 0; for (auto &MBB : MF) { for (auto &MI : MBB) { if (!MI.isCall()) continue; + CallsFound++; const MachineOperand *CalleeOp = TII->getNamedOperand(MI, AMDGPU::OpName::callee); if (CalleeOp && CalleeOp->isGlobal()) { @@ -156,6 +162,11 @@ bool AMDGPUMachineLevelInliner::runOnMachineFunction(MachineFunction &MF) { } } + // Reset HasCalls if we're about to inline all of them. This will be updated + // further during inlining if any of the callees introduces its own calls. + // FIXME: HasTailCall! + MFI.setHasCalls(CallsFound != CallsToInline.size()); + // Perform the actual inlining. 
for (MachineInstr *CallMI : CallsToInline) { const MachineOperand *CalleeOp = @@ -176,6 +187,14 @@ bool AMDGPUMachineLevelInliner::runOnMachineFunction(MachineFunction &MF) { Changed = true; } + if (Changed) { + if (MaxInlinedCalleeStackSize != 0) + createCalleeStackObject(MFI); + + if (HasInlinedVarSizedStack) + MFI.CreateVariableSizedObject(Align(1), /*Alloca=*/nullptr); + } + return Changed; } @@ -184,6 +203,9 @@ void AMDGPUMachineLevelInliner::inlineMachineFunction(MachineFunction *CallerMF, MachineFunction *CalleeMF, const SIInstrInfo *TII) { + // TODO: update SIMachineFunctionInfo (e.g. Occupancy) + updateCallerFrameInfo(CallerMF->getFrameInfo(), *CalleeMF); + MachineBasicBlock *CallMBB = CallMI->getParent(); MachineBasicBlock *ContinuationMBB = CallMBB->splitAt(*CallMI, /*UpdateLiveIns=*/true); @@ -287,6 +309,55 @@ void AMDGPUMachineLevelInliner::cleanupAfterInlining( MI->eraseFromParent(); } +void AMDGPUMachineLevelInliner::updateCallerFrameInfo( + MachineFrameInfo &CallerMFI, const MachineFunction &CalleeMF) { + const MachineFrameInfo &CalleeMFI = CalleeMF.getFrameInfo(); + const GCNSubtarget &ST = CalleeMF.getSubtarget(); + const TargetRegisterInfo &TRI = *ST.getRegisterInfo(); + + // Follow the prologue logic. + uint64_t CalleeStackSize = CalleeMFI.getStackSize(); + if (TRI.hasStackRealignment(CalleeMF)) + CalleeStackSize += CalleeMFI.getMaxAlign().value(); + uint64_t TrueCalleeStackSize = CalleeStackSize * ST.getScratchScaleFactor(); + + // Only one of the stacks of the callees will + // be active at any given time, so we only need to make sure the largest one + // fits. + MaxInlinedCalleeStackSize = + std::max(MaxInlinedCalleeStackSize, TrueCalleeStackSize); + + // Track if any callee has variable-sized stack objects. 
+ if (CalleeMFI.hasVarSizedObjects()) + HasInlinedVarSizedStack = true; + +#define SET_IF_ANY(SETTER, GETTER) \ + CallerMFI.SETTER(CallerMFI.GETTER() || CalleeMFI.GETTER()) + + SET_IF_ANY(setHasCalls, hasCalls); + SET_IF_ANY(setHasTailCall, hasTailCall); + SET_IF_ANY(setAdjustsStack, adjustsStack); + SET_IF_ANY(setFrameAddressIsTaken, isFrameAddressTaken); + SET_IF_ANY(setReturnAddressIsTaken, isReturnAddressTaken); + SET_IF_ANY(setHasVAStart, hasVAStart); + SET_IF_ANY(setHasMustTailInVarArgFunc, hasMustTailInVarArgFunc); + SET_IF_ANY(setHasOpaqueSPAdjustment, hasOpaqueSPAdjustment); + SET_IF_ANY(setHasCopyImplyingStackAdjustment, hasCopyImplyingStackAdjustment); + +#undef SET_IF_ANY +} + +void AMDGPUMachineLevelInliner::createCalleeStackObject( + MachineFrameInfo &CallerMFI) { + // Create a stack object representing the maximum callee stack space + uint64_t CallerStackSize = CallerMFI.getStackSize(); + int CalleeStackIdx = + CallerMFI.CreateStackObject(MaxInlinedCalleeStackSize, Align(1), + /*isSpillSlot=*/false); + CallerMFI.setObjectOffset(CalleeStackIdx, CallerStackSize); + CallerMFI.setStackSize(CallerStackSize + MaxInlinedCalleeStackSize); +} + FunctionPass *llvm::createAMDGPUMachineLevelInlinerPass() { return new AMDGPUMachineLevelInliner(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.h index ab5ecdc5dbd41..51a2e494247a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.h @@ -27,6 +27,7 @@ namespace llvm { +class GCNSubtarget; class SIInstrInfo; class AMDGPUMachineLevelInliner : public MachineFunctionPass { @@ -52,6 +53,21 @@ class AMDGPUMachineLevelInliner : public MachineFunctionPass { void cleanupAfterInlining(MachineFunction *CallerMF, MachineInstr *CallMI, const SIInstrInfo *TII) const; + + void updateCallerFrameInfo(MachineFrameInfo &CallerMFI, + const MachineFunction &CalleeMF); + + /// Create a stack object 
representing the stacks of all the inlined callees. + /// Its size will be large enough to accommodate the callee with the largest + /// stack. + void createCalleeStackObject(MachineFrameInfo &CallerMFI); + + /// The maximum stack size among all inlined callees (including any padding + /// required to ensure proper alignment). + uint64_t MaxInlinedCalleeStackSize = 0; + + /// Whether any inlined callee has variable-sized stack objects. + bool HasInlinedVarSizedStack = false; }; } // end namespace llvm diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner-mfi.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner-mfi.mir new file mode 100644 index 0000000000000..bcda11a2419a1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner-mfi.mir @@ -0,0 +1,651 @@ +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -amdgpu-enable-machine-level-inliner -run-pass=amdgpu-inlining-anchor,amdgpu-machine-level-inliner %s -o - | FileCheck %s + +# Test that the inliner correctly updates the MachineFunctionInfo + +--- | + ; Test that we update the frame info for the caller with info from the callee. + ; In particular, hasCalls should be false after inlining. + define amdgpu_gfx_whole_wave i32 @wwf_with_local_no_calls(i1 %mask, i32 %x) { + %local = alloca i32, addrspace(5) + ret i32 0 + } + define amdgpu_cs void @inline_wwf_with_local_no_calls(i32 %y) { + %local = alloca i32, addrspace(5) + ret void + } + ; Same as above, but also make sure we reuse stack space between different callees. + define amdgpu_cs void @inline_wwf_with_local_twice(i32 %y) { + %local = alloca i32, addrspace(5) + ret void + } + + ; Test callees with different stack sizes and alignments. 
+ define amdgpu_gfx_whole_wave i32 @wwf_large_stack_small_align(i1 %mask) { + %local = alloca i32, i32 512, align 4, addrspace(5) + ret i32 0 + } + define amdgpu_gfx_whole_wave i32 @wwf_small_stack_large_align(i1 %mask) { + %local = alloca i32, align 1024, addrspace(5) + ret i32 0 + } + define amdgpu_cs void @inline_wwf_different_stack_shapes() { + ret void + } + + ; Test dynamic stack allocations. + define amdgpu_gfx_whole_wave i32 @wwf_dyn_stack(i1 %mask, i32 inreg %size) { + %local = alloca i32, i32 %size, addrspace(5) + ret i32 0 + } + define amdgpu_cs void @inline_wwf_dyn_stack_callee(i32 inreg %size, ptr addrspace(1) %output) { ret void } + + ; Test that we correctly handle stack arguments. + define amdgpu_gfx_whole_wave i32 @wwf_with_stack_args(i1 %active, <33 x i32> %vec) { ret i32 0 } + define amdgpu_cs void @inline_wwf_with_stack_args(i32 %x, i32 %y, ptr addrspace(1) %output) { ret void } + + ; Test that we update hasCalls if the callee contains its own calls. + define amdgpu_gfx_whole_wave i32 @wwf_with_calls(i1 %mask, i32 %x) { ret i32 0} + define amdgpu_cs void @inline_wwf_with_calls(i32 %y) { ret void } + + ; Test that hasCalls is still correct if the caller has other calls. + define amdgpu_gfx i32 @wont_inline() { ret i32 0 } + define amdgpu_cs void @inline_wwf_without_calls(i32 %y) { ret void } +... 
+--- +name: wwf_with_local_no_calls +tracksRegLiveness: true +frameInfo: + stackSize: 16 +stack: + - { id: 0, name: local, type: default, offset: 0, size: 8, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: default, offset: 12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: false + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + wwmReservedRegs: + - '$vgpr1' + scavengeFI: '%stack.2' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 -1 + $vgpr1 = V_MUL_LO_U32_e64 $vgpr0, 18, implicit $exec + SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 $sgpr0 + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 +... 
+--- +name: inline_wwf_with_local_no_calls +# CHECK-LABEL: name: inline_wwf_with_local_no_calls +tracksRegLiveness: true +frameInfo: + hasCalls: true + stackSize: 8 +# CHECK: frameInfo: +# CHECK: stackSize: 24 +# CHECK: offsetAdjustment: 0 +# CHECK: maxAlignment: 4 +# CHECK: adjustsStack: false +# CHECK: hasCalls: false +# CHECK: hasTailCall: false +stack: + - { id: 0, name: local, type: default, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: default, offset: 4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +# CHECK: stack: +# CHECK-NEXT: - { id: 0, name: local, type: default, offset: 0, size: 4, alignment: 4, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: local-offset: 0, debug-info-variable: '', debug-info-expression: '', +# CHECK-NEXT: debug-info-location: '' } +# CHECK-NEXT: - { id: 1, name: '', type: default, offset: 4, size: 4, alignment: 4, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +# CHECK-NEXT: - { id: 2, name: '', type: default, offset: 8, size: 16, alignment: 1, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: true + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + scavengeFI: '%stack.1' + isWholeWaveFunction: false +# CHECK: machineFunctionInfo: +# CHECK: isEntryFunction: true +# CHECK: 
isChainFunction: false +# CHECK: scratchRSrcReg: '$private_rsrc_reg' +# CHECK frameOffsetReg: '$fp_reg' +# CHECK stackPtrOffsetReg: '$sgpr32' +# CHECK scavengeFI: '%stack.1' +# CHECK isWholeWaveFunction: false + +body: | + bb.0: + liveins: $vgpr0 + + $sgpr32 = S_MOV_B32 16 + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_with_local_no_calls + $sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_with_local_no_calls + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, @wwf_with_local_no_calls, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + SCRATCH_STORE_DWORD_ST killed $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + S_ENDPGM 0 +... +name: inline_wwf_with_local_twice +# CHECK-LABEL: name: inline_wwf_with_local_twice +tracksRegLiveness: true +frameInfo: + hasCalls: true + stackSize: 8 +# CHECK: frameInfo: +# CHECK-NEXT: isFrameAddressTaken: false +# CHECK-NEXT: isReturnAddressTaken: false +# CHECK-NEXT: hasStackMap: false +# CHECK-NEXT: hasPatchPoint: false +# CHECK-NEXT: stackSize: 24 +# CHECK-NEXT: offsetAdjustment: 0 +# CHECK-NEXT: maxAlignment: 4 +# CHECK-NEXT: adjustsStack: false +# CHECK-NEXT: hasCalls: false +# CHECK-NEXT: stackProtector: '' +# CHECK-NEXT: functionContext: '' +# CHECK-NEXT: maxCallFrameSize: 4294967295 +# CHECK-NEXT: cvBytesOfCalleeSavedRegisters: 0 +# CHECK-NEXT: hasOpaqueSPAdjustment: false +# CHECK-NEXT: hasVAStart: false +# CHECK-NEXT: hasMustTailInVarArgFunc: false +# CHECK-NEXT: hasTailCall: false +# CHECK-NEXT: isCalleeSavedInfoValid: false +# CHECK-NEXT: localFrameSize: 0 +stack: + - { id: 0, name: local, type: default, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: default, offset: 4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', 
callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +# CHECK: stack: +# CHECK-NEXT: - { id: 0, name: local, type: default, offset: 0, size: 4, alignment: 4, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: local-offset: 0, debug-info-variable: '', debug-info-expression: '', +# CHECK-NEXT: debug-info-location: '' } +# CHECK-NEXT: - { id: 1, name: '', type: default, offset: 4, size: 4, alignment: 4, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +# CHECK-NEXT: - { id: 2, name: '', type: default, offset: 8, size: 16, alignment: 1, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + +body: | + bb.0: + liveins: $vgpr0 + + $sgpr32 = S_MOV_B32 16 + $sgpr7 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_with_local_no_calls + $sgpr6 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_with_local_no_calls + dead $sgpr30_sgpr31 = SI_CALL $sgpr6_sgpr7, @wwf_with_local_no_calls, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + SCRATCH_STORE_DWORD_ST $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr6_sgpr7, @wwf_with_local_no_calls, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + S_ENDPGM 0 +... 
+--- +name: wwf_large_stack_small_align +tracksRegLiveness: true +frameInfo: + stackSize: 2056 +stack: + - { id: 0, name: local, type: default, offset: 0, size: 2048, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 2048, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: default, offset: 2052, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: false + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + wwmReservedRegs: + - '$vgpr1' + scavengeFI: '%stack.2' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 -1 + $vgpr1 = V_MUL_LO_U32_e64 $vgpr0, 18, implicit $exec + SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 $sgpr0 + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 +... 
+--- +name: wwf_small_stack_large_align +tracksRegLiveness: true +frameInfo: + stackSize: 12 +stack: + - { id: 0, name: local, type: default, offset: 0, size: 4, alignment: 1024, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: default, offset: 8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: false + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + wwmReservedRegs: + - '$vgpr1' + scavengeFI: '%stack.2' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 -1 + $vgpr1 = V_MUL_LO_U32_e64 $vgpr0, 18, implicit $exec + SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 $sgpr0 + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 +... 
+--- +name: inline_wwf_different_stack_shapes +# CHECK-LABEL: name: inline_wwf_different_stack_shapes +tracksRegLiveness: true +frameInfo: + hasCalls: true + stackSize: 4 +# CHECK: frameInfo: +# CHECK-NEXT: isFrameAddressTaken: false +# CHECK-NEXT: isReturnAddressTaken: false +# CHECK-NEXT: hasStackMap: false +# CHECK-NEXT: hasPatchPoint: false +# CHECK-NEXT: stackSize: 2060 +# CHECK-NEXT: offsetAdjustment: 0 +# CHECK-NEXT: maxAlignment: 4 +# CHECK-NEXT: adjustsStack: false +# CHECK-NEXT: hasCalls: false +# CHECK-NEXT: stackProtector: '' +# CHECK-NEXT: functionContext: '' +# CHECK-NEXT: maxCallFrameSize: 4294967295 +# CHECK-NEXT: cvBytesOfCalleeSavedRegisters: 0 +# CHECK-NEXT: hasOpaqueSPAdjustment: false +# CHECK-NEXT: hasVAStart: false +# CHECK-NEXT: hasMustTailInVarArgFunc: false +# CHECK-NEXT: hasTailCall: false +# CHECK-NEXT: isCalleeSavedInfoValid: false +# CHECK-NEXT: localFrameSize: 0 +stack: + - { id: 0, name: '', type: default, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +# CHECK: stack: +# CHECK-NEXT: - { id: 0, name: '', type: default, offset: 0, size: 4, alignment: 4, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +# CHECK-NEXT: - { id: 1, name: '', type: default, offset: 4, size: 2056, alignment: 1, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + +body: | + bb.0: + liveins: $vgpr0 + + $sgpr32 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_large_stack_small_align + $sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_large_stack_small_align + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, 
@wwf_large_stack_small_align, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_small_stack_large_align + $sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_small_stack_large_align + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, @wwf_small_stack_large_align, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + S_ENDPGM 0 +... +--- +name: wwf_dyn_stack +tracksRegLiveness: true +frameInfo: + stackSize: 16 + maxAlignment: 4 + adjustsStack: true + hasCalls: false +stack: + - { id: 0, name: local, type: variable-sized, offset: 0, alignment: 1, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: 4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: default, offset: 8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + +body: | + bb.0 (%ir-block.0): + liveins: $sgpr3, $sgpr4, $vgpr0, $vgpr1 + + $sgpr3 = S_MOV_B32 $sgpr33 + $sgpr33 = S_MOV_B32 $sgpr32 + $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + SCRATCH_STORE_DWORD_SADDR 
$vgpr1, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + $exec_lo = S_MOV_B32 -1 + $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 16, implicit-def dead $scc + renamable $sgpr1 = S_LSHL_B32 killed renamable $sgpr4, 2, implicit-def dead $scc + renamable $sgpr1 = nuw S_ADD_I32 killed renamable $sgpr1, 15, implicit-def dead $scc + $sgpr2 = S_MOV_B32 $sgpr32 + renamable $sgpr1 = S_AND_B32 killed renamable $sgpr1, -16, implicit-def dead $scc + renamable $vgpr1 = V_ADD_U32_e32 100, $vgpr0, implicit $exec + renamable $sgpr1 = S_LSHL_B32 killed renamable $sgpr1, 5, implicit-def dead $scc + $sgpr32 = S_ADD_I32 renamable $sgpr2, killed renamable $sgpr1, implicit-def dead $scc + SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec + $sgpr32 = S_MOV_B32 $sgpr33 + $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.1, addrspace 5) + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + $exec_lo = S_MOV_B32 $sgpr0 + $sgpr33 = S_MOV_B32 $sgpr3 + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 +... 
+--- +name: inline_wwf_dyn_stack_callee +tracksRegLiveness: true +frameInfo: + stackSize: 0 + adjustsStack: true + hasCalls: true +# CHECK: frameInfo: +# CHECK: stackSize: 16 +# CHECK: offsetAdjustment: 0 +# CHECK: maxAlignment: 1 +# CHECK: adjustsStack: true +# CHECK: hasCalls: false +# CHECK: hasTailCall: false +stack: [] +# CHECK: stack: +# CHECK-NEXT: - { id: 0, name: '', type: default, offset: 0, size: 16, alignment: 1, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: true + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + isWholeWaveFunction: false +# CHECK: machineFunctionInfo: +# CHECK: isEntryFunction: true +# CHECK: isChainFunction: false +# CHECK: scratchRSrcReg: '$private_rsrc_reg' +# CHECK frameOffsetReg: '$fp_reg' +# CHECK stackPtrOffsetReg: '$sgpr32' +# CHECK isWholeWaveFunction: false +body: | + bb.0 (%ir-block.0): + liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2 + + $sgpr32 = S_MOV_B32 0 + $vgpr41 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $exec + $vgpr40 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec + renamable $sgpr3 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_dyn_stack + renamable $sgpr2 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_dyn_stack + $sgpr4 = S_MOV_B32 killed $sgpr0 + dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr2_sgpr3, @wwf_dyn_stack, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $vgpr0, implicit-def $vgpr0 + GLOBAL_STORE_DWORD killed renamable $vgpr40_vgpr41, killed renamable $vgpr0, 0, 0, implicit $exec :: (store (s32) into %ir.output, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: wwf_with_stack_args +tracksRegLiveness: true +frameInfo: + stackSize: 16 +fixedStack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 16, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +stack: + - { id: 0, name: '', type: spill-slot, offset: 4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: default, offset: 12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: false + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + bytesInStackArgArea: 4 + wwmReservedRegs: + - '$vgpr1' + scavengeFI: '%stack.2' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 -1 + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr0, implicit $exec + $exec_lo = 
S_XOR_B32 $sgpr0, -1, implicit-def $scc + $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 $sgpr0 + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 +... +--- +name: inline_wwf_with_stack_args +# CHECK-LABEL: name: inline_wwf_with_stack_args +tracksRegLiveness: true +frameInfo: + stackSize: 0 + adjustsStack: true + hasCalls: true +# CHECK: frameInfo: +# CHECK: stackSize: 16 +# CHECK: offsetAdjustment: 0 +# CHECK: maxAlignment: 1 +# CHECK: adjustsStack: true +# CHECK: hasCalls: false +# CHECK: hasTailCall: false +fixedStack: [] +stack: [] +# CHECK: stack: +# CHECK-NEXT: - { id: 0, name: '', type: default, offset: 0, size: 16, alignment: 1, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: true + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + isWholeWaveFunction: false +# CHECK: machineFunctionInfo: +# CHECK: isEntryFunction: true +# CHECK: isChainFunction: false +# CHECK: scratchRSrcReg: '$private_rsrc_reg' +# CHECK: frameOffsetReg: '$fp_reg' +# CHECK: stackPtrOffsetReg: '$sgpr32' +# CHECK: scavengeFI: '%stack.0' +# CHECK: isWholeWaveFunction: false +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + $sgpr32 = S_MOV_B32 0 + $vgpr41 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec + $vgpr40 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $exec + SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack, align 16, addrspace 5) + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = 
V_MOV_B32_e32 0, implicit $exec + $vgpr3 = V_MOV_B32_e32 0, implicit $exec + $vgpr4 = V_MOV_B32_e32 0, implicit $exec + $vgpr5 = V_MOV_B32_e32 0, implicit $exec + $vgpr6 = V_MOV_B32_e32 0, implicit $exec + $vgpr7 = V_MOV_B32_e32 0, implicit $exec + $vgpr8 = V_MOV_B32_e32 0, implicit $exec + $vgpr9 = V_MOV_B32_e32 0, implicit $exec + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr14 = V_MOV_B32_e32 0, implicit $exec + $vgpr15 = V_MOV_B32_e32 0, implicit $exec + $vgpr16 = V_MOV_B32_e32 0, implicit $exec + $vgpr17 = V_MOV_B32_e32 0, implicit $exec + $vgpr18 = V_MOV_B32_e32 0, implicit $exec + $vgpr19 = V_MOV_B32_e32 0, implicit $exec + $vgpr20 = V_MOV_B32_e32 0, implicit $exec + $vgpr21 = V_MOV_B32_e32 0, implicit $exec + $vgpr22 = V_MOV_B32_e32 0, implicit $exec + $vgpr23 = V_MOV_B32_e32 0, implicit $exec + $vgpr24 = V_MOV_B32_e32 0, implicit $exec + $vgpr25 = V_MOV_B32_e32 0, implicit $exec + $vgpr26 = V_MOV_B32_e32 0, implicit $exec + $vgpr27 = V_MOV_B32_e32 0, implicit $exec + $vgpr28 = V_MOV_B32_e32 0, implicit $exec + $vgpr29 = V_MOV_B32_e32 0, implicit $exec + $vgpr30 = V_MOV_B32_e32 0, implicit $exec + $vgpr31 = V_MOV_B32_e32 0, implicit $exec + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_with_stack_args + $sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_with_stack_args + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, @wwf_with_stack_args, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, 
implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31, implicit-def $vgpr0 + GLOBAL_STORE_DWORD killed renamable $vgpr40_vgpr41, killed renamable $vgpr0, 0, 0, implicit $exec :: (store (s32) into %ir.output, addrspace 1) + S_ENDPGM 0 +... +--- +name: wwf_with_calls +tracksRegLiveness: true +frameInfo: + stackSize: 8 + hasCalls: true +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: default, offset: 4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: false + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + wwmReservedRegs: + - '$vgpr1' + scavengeFI: '%stack.1' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr2_sgpr3 + + $sgpr25 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 -1 + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr2_sgpr3, @wwf_with_local_no_calls, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + $vgpr1 = V_MUL_LO_U32_e64 $vgpr0, 18, implicit $exec + $exec_lo = S_XOR_B32 $sgpr25, -1, implicit-def $scc + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 $sgpr25 + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 +...
+--- +name: inline_wwf_with_calls +# CHECK-LABEL: name: inline_wwf_with_calls +tracksRegLiveness: true +frameInfo: + hasCalls: true +# CHECK: frameInfo: +# CHECK: hasCalls: true + +body: | + bb.0: + liveins: $vgpr0 + + $sgpr32 = S_MOV_B32 16 + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_with_calls + $sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_with_calls + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, @wwf_with_calls, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + S_ENDPGM 0 +... +--- +name: inline_wwf_without_calls +# CHECK-LABEL: name: inline_wwf_without_calls +tracksRegLiveness: true +frameInfo: + hasCalls: true +# CHECK: frameInfo: +# CHECK: hasCalls: true + +body: | + bb.0: + liveins: $vgpr0 + + $sgpr32 = S_MOV_B32 16 + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_with_local_no_calls + $sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_with_local_no_calls + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, @wwf_with_local_no_calls, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wont_inline + $sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wont_inline + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, @wont_inline, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.ll index 586f621ba133c..18e28ba50c3e5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.ll @@ -263,6 +263,129 @@ define amdgpu_cs void @inline_wwf_that_realigns_stack(i32 %y) { ret void } +define amdgpu_gfx_whole_wave i32 @wwf_with_dynamic_alloca(i1 %active, i32 inreg %size, i32 %value) { + %dynamic_array = alloca i32, i32 %size, addrspace(5) + store volatile i32 %value, ptr addrspace(5) %dynamic_array + %result = add i32 %value, 100 + ret i32 %result +} + +define amdgpu_cs void @inline_wwf_with_dynamic_alloca(i32 inreg %array_size, i32 %val, ptr addrspace(1) %output) { +; CHECK-LABEL: inline_wwf_with_dynamic_alloca: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s4, s0 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_mov_b32 s3, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b32 s0, -1 +; CHECK-NEXT: global_wb scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS +; CHECK-NEXT: global_wb scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_store_b32 off, v1, s33 offset:4 scope:SCOPE_SYS +; CHECK-NEXT: s_mov_b32 exec_lo, -1 +; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x64, v0 +; CHECK-NEXT: s_add_co_i32 s32, s32, 16 +; CHECK-NEXT: s_lshl2_add_u32 s1, s4, 15 +; CHECK-NEXT: s_mov_b32 s2, s32 +; CHECK-NEXT: s_and_b32 s1, s1, -16 +; CHECK-NEXT: global_wb scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_store_b32 off, v0, s2 scope:SCOPE_SYS +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: s_lshl_b32 s1, s1, 5 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_add_co_i32 s32, s2, s1 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_b32 exec_lo, s0, 
-1 +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, s33 scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: global_inv scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: scratch_load_b32 v1, off, s33 offset:4 scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: global_inv scope:SCOPE_SYS +; CHECK-NEXT: s_mov_b32 exec_lo, s0 +; CHECK-NEXT: s_mov_b32 s33, s3 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: global_store_b32 v[40:41], v0, off +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; CHECK-NEXT: s_endpgm + %result = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @wwf_with_dynamic_alloca, i32 inreg %array_size, i32 %val) + store i32 %result, ptr addrspace(1) %output + ret void +} + +define amdgpu_gfx_whole_wave i32 @wwf_with_stack_args(i1 %active, <33 x i32> %vec) { + %elem0 = extractelement <33 x i32> %vec, i32 0 + %elem32 = extractelement <33 x i32> %vec, i32 32 + %sum = add i32 %elem0, %elem32 + ret i32 %sum +} + +define amdgpu_cs void @inline_wwf_with_stack_args(i32 %x, i32 %y, ptr addrspace(1) %output) { +; CHECK-LABEL: inline_wwf_with_stack_args: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: v_dual_mov_b32 v41, v3 :: v_dual_mov_b32 v40, v2 +; CHECK-NEXT: scratch_store_b32 off, v1, s32 +; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; CHECK-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, 0 +; CHECK-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v6, 0 +; CHECK-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v8, 0 +; CHECK-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 0 +; CHECK-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v12, 0 +; CHECK-NEXT: v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v14, 0 +; CHECK-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v16, 0 +; CHECK-NEXT: v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v18, 0 +; CHECK-NEXT: v_dual_mov_b32 v19, 0 :: v_dual_mov_b32 v20, 0 +; CHECK-NEXT: v_dual_mov_b32 v21, 0 :: v_dual_mov_b32 v22, 0 +; 
CHECK-NEXT: v_dual_mov_b32 v23, 0 :: v_dual_mov_b32 v24, 0 +; CHECK-NEXT: v_dual_mov_b32 v25, 0 :: v_dual_mov_b32 v26, 0 +; CHECK-NEXT: v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v28, 0 +; CHECK-NEXT: v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v30, 0 +; CHECK-NEXT: v_mov_b32_e32 v31, 0 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_xor_saveexec_b32 s0, -1 +; CHECK-NEXT: global_wb scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS +; CHECK-NEXT: global_wb scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_store_b32 off, v1, s32 offset:8 scope:SCOPE_SYS +; CHECK-NEXT: s_mov_b32 exec_lo, -1 +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_load_b32 v1, off, s32 scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: global_inv scope:SCOPE_SYS +; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; CHECK-NEXT: s_xor_b32 exec_lo, s0, -1 +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: global_inv scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: scratch_load_b32 v1, off, s32 offset:8 scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: global_inv scope:SCOPE_SYS +; CHECK-NEXT: s_mov_b32 exec_lo, s0 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: global_store_b32 v[40:41], v0, off +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; CHECK-NEXT: s_endpgm + %vec = insertelement <33 x i32> zeroinitializer, i32 %x, i32 0 + %vec2 = insertelement <33 x i32> %vec, i32 %y, i32 32 + %result = call i32(ptr, ...) 
@llvm.amdgcn.call.whole.wave(ptr @wwf_with_stack_args, <33 x i32> %vec2) + store i32 %result, ptr addrspace(1) %output + ret void +} + ; Regular function (not whole wave) - should not be inlined define amdgpu_gfx i32 @regular_function(i32 %x) { ; CHECK-LABEL: regular_function: diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.mir index 6382f7e2abcf0..1ac90e47fd984 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.mir +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.mir @@ -8,6 +8,15 @@ define amdgpu_cs void @inline_multiple_wwf(i32 %x, i32 %y, ptr addrspace(1) %out1, ptr addrspace(1) %out2) { ret void } define amdgpu_gfx_whole_wave i32 @another_whole_wave_func(i1 %active, i32 %a, i32 %b) { ret i32 0 } + define amdgpu_cs void @inline_wwf_with_local(i32 %y) { + %local = alloca i32, addrspace(5) + ret void + } + define amdgpu_gfx_whole_wave i32 @whole_wave_func_with_local(i1 %mask, i32 %x) { + %local = alloca i32, addrspace(5) + ret i32 0 + } + define amdgpu_cs void @dont_inline_non_wwf(i32 %input, ptr addrspace(1) %output) { ret void } define amdgpu_gfx i32 @regular_function(i32 %x) { ret i32 0 } ... @@ -267,6 +276,96 @@ body: | S_ENDPGM 0 ... 
--- +name: inline_wwf_with_local +tracksRegLiveness: true +frameInfo: + hasCalls: true +stack: + - { id: 0, name: local, type: default, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: default, offset: 4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: inline_wwf_with_local + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr32 = S_MOV_B32 16 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: $vgpr1 = V_MUL_LO_U32_e64 $vgpr0, 18, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .1: + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + ; CHECK-NEXT: S_ENDPGM 0 + $sgpr32 = S_MOV_B32 16 + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @whole_wave_func_with_local + 
$sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @whole_wave_func_with_local + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, @whole_wave_func_with_local, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + SCRATCH_STORE_DWORD_ST killed $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + S_ENDPGM 0 +... +--- +name: whole_wave_func_with_local +tracksRegLiveness: true +frameInfo: + stackSize: 16 +stack: + - { id: 0, name: local, type: default, offset: 0, size: 8, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: default, offset: 12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: false + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + wwmReservedRegs: + - '$vgpr1' + scavengeFI: '%stack.2' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr1 + + ; CHECK-NOT: name: whole_wave_func_with_local + $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 -1 + $vgpr1 = V_MUL_LO_U32_e64 $vgpr0, 18, implicit $exec + SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + $exec_lo = 
S_XOR_B32 $sgpr0, -1, implicit-def $scc + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 $sgpr0 + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 +... +--- name: dont_inline_non_wwf alignment: 1 tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6-inliner.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6-inliner.ll new file mode 100644 index 0000000000000..5fed08723055c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6-inliner.ll @@ -0,0 +1,199 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -amdgpu-enable-machine-level-inliner < %s | FileCheck %s --check-prefixes=CHECK + +; CHECK-LABEL: {{^}}cs_shader: +; CHECK: .set cs_shader.num_vgpr, 67{{$}} +; CHECK: .set cs_shader.numbered_sgpr, 67{{$}} +; CHECK: .set cs_shader.private_seg_size, 2064{{$}} +; CHECK: .set cs_shader.has_dyn_sized_stack, 0{{$}} +; CHECK: .set cs_shader.has_recursion, 0{{$}} +; CHECK: .set cs_shader.has_indirect_call, 0{{$}} +; CHECK-LABEL: {{^}}ps_shader: +; CHECK: .set ps_shader.num_vgpr, 1{{$}} +; CHECK: .set ps_shader.numbered_sgpr, 34{{$}} +; CHECK: .set ps_shader.private_seg_size, 16{{$}} +; CHECK: .set ps_shader.has_dyn_sized_stack, 1{{$}} +; CHECK: .set ps_shader.has_recursion, 0{{$}} +; CHECK: .set ps_shader.has_indirect_call, 0{{$}} +; CHECK-LABEL: {{^}}gs_shader: +; CHECK: .set gs_shader.num_vgpr, max(248, amdgpu.max_num_vgpr) +; CHECK: .set gs_shader.numbered_sgpr, max(96, amdgpu.max_num_sgpr) +; CHECK: .set gs_shader.private_seg_size, 592{{$}} +; CHECK: .set gs_shader.has_dyn_sized_stack, 1{{$}} +; CHECK: .set gs_shader.has_recursion, 1{{$}} +; CHECK: .set gs_shader.has_indirect_call, 1{{$}} +; CHECK-LABEL: .amdgpu_pal_metadata +; CHECK-NEXT: --- +; CHECK-NEXT: amdpal.pipelines: +; CHECK-NEXT: - .api: Vulkan +; CHECK-NEXT: .compute_registers: +; CHECK-NEXT: .tg_size_en: true +; CHECK-NEXT: .tgid_x_en: false +; CHECK-NEXT: .tgid_y_en: 
false +; CHECK-NEXT: .tgid_z_en: false +; CHECK-NEXT: .tidig_comp_cnt: 0x1{{$}} +; CHECK-NEXT: .graphics_registers: +; CHECK-NEXT: .ps_extra_lds_size: 0{{$}} +; CHECK-NEXT: .spi_ps_input_addr: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: false +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: false +; CHECK-NEXT: .linear_centroid_ena: false +; CHECK-NEXT: .linear_sample_ena: false +; CHECK-NEXT: .persp_center_ena: false +; CHECK-NEXT: .persp_centroid_ena: false +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: true +; CHECK-NEXT: .pos_fixed_pt_ena: false +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: .spi_ps_input_ena: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: false +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: false +; CHECK-NEXT: .linear_centroid_ena: false +; CHECK-NEXT: .linear_sample_ena: false +; CHECK-NEXT: .persp_center_ena: false +; CHECK-NEXT: .persp_centroid_ena: false +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: true +; CHECK-NEXT: .pos_fixed_pt_ena: false +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: .hardware_stages: +; CHECK-NEXT: .cs: +; CHECK-NEXT: .checksum_value: 0x9444d7d0 +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point_symbol: cs_shader +; CHECK-NEXT: .excp_en: 0{{$}} +; CHECK-NEXT: .float_mode: 0xc0{{$}} +; CHECK-NEXT: .forward_progress: true +; CHECK-NEXT: .image_op: false +; CHECK-NEXT: .lds_size: 0{{$}} +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: true +; CHECK-NEXT: .scratch_memory_size: 0x810{{$}} +; 
CHECK-NEXT: .sgpr_count: 0x45{{$}} +; CHECK-NEXT: .sgpr_limit: 0x6a{{$}} +; CHECK-NEXT: .threadgroup_dimensions: +; CHECK-NEXT: - 0x1{{$}} +; CHECK-NEXT: - 0x400{{$}} +; CHECK-NEXT: - 0x1{{$}} +; CHECK-NEXT: .trap_present: false +; CHECK-NEXT: .user_data_reg_map: +; CHECK-NEXT: - 0x10000000 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: .user_sgprs: 0x3{{$}} +; CHECK-NEXT: .vgpr_count: 0x43{{$}} +; CHECK-NEXT: .vgpr_limit: 0x100{{$}} +; CHECK-NEXT: .wavefront_size: 0x40{{$}} +; CHECK-NEXT: .wgp_mode: true +; CHECK-NEXT: .gs: +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point_symbol: gs_shader +; CHECK-NEXT: .forward_progress: true +; CHECK-NEXT: .lds_size: 0{{$}} +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: true +; CHECK-NEXT: .scratch_memory_size: 0x250{{$}} +; CHECK-NEXT: .sgpr_count: 0x62{{$}} +; CHECK-NEXT: .vgpr_count: 0xf8{{$}} +; CHECK-NEXT: .wgp_mode: true +; CHECK-NEXT: .ps: +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point_symbol: ps_shader +; CHECK-NEXT: .forward_progress: true +; CHECK-NEXT: .lds_size: 0{{$}} +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: true +; CHECK-NEXT: 
.scratch_memory_size: 0x10{{$}} +; CHECK-NEXT: .sgpr_count: 0x24{{$}} +; CHECK-NEXT: .vgpr_count: 0x2{{$}} +; CHECK-NEXT: .wgp_mode: true +; CHECK: .registers: {} +; CHECK:amdpal.version: +; CHECK-NEXT: - 0x3{{$}} +; CHECK-NEXT: - 0x6{{$}} +; CHECK-NEXT:... +; CHECK-NEXT: .end_amdgpu_pal_metadata + +; Callee with high VGPR, SGPR and stack usage. The PAL metadata should reflect this. +define amdgpu_gfx_whole_wave i32 @wwf(i1 %active, i32 %x) { + call void asm sideeffect "; touch high VGPR and SGPR", "~{v66},~{s66}"() + %temp = alloca i32, align 1024, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 1024 + %result = add i32 %x, 42 + ret i32 %result +} + +define amdgpu_cs void @cs_shader(i32 %y) { + %local = alloca i32, addrspace(5) + %result = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @wwf, i32 %y) + %storable = mul i32 %result, %y + store volatile i32 %storable, ptr addrspace(5) %local + ret void +} + +; Test that dynamic stack allocations in the callee are reported for the caller. +define amdgpu_gfx_whole_wave void @wwf_dyn_stack(i1 %active, i32 inreg %size, i32 %x) { + %temp = alloca i32, i32 %size, addrspace(5) + store volatile i32 %x, ptr addrspace(5) %temp + ret void +} + +define amdgpu_ps void @ps_shader() #1 { + call void(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @wwf_dyn_stack, i32 inreg 12, i32 121) + ret void +} + +; Test that indirect calls in the callee are reported for the caller. +define amdgpu_gfx_whole_wave void @wwf_indirect(i1 %active, ptr inreg %func_ptr, i32 %x) { + call void(i32) %func_ptr(i32 %x) + ret void +} + +define amdgpu_gs void @gs_shader(ptr inreg %func_ptr) { + call void(ptr, ...) 
@llvm.amdgcn.call.whole.wave(ptr @wwf_indirect, ptr inreg %func_ptr, i32 42) + ret void +} + +!amdgpu.pal.metadata.msgpack = !{!0} + +!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\06"} From 05167263fa2e4e9e30d1aa88b5f4fb3a2f605966 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Tue, 25 Nov 2025 11:05:28 +0100 Subject: [PATCH 3/3] Clang format... 
Please ignore --- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 040553d6ea7da..9f42a3e8ae922 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1291,9 +1291,9 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, .addImm((Alignment - 1) * ST.getScratchScaleFactor()) .setMIFlag(MachineInstr::FrameSetup); auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) - .addReg(FramePtrReg, RegState::Kill) - .addImm(-Alignment * ST.getScratchScaleFactor()) - .setMIFlag(MachineInstr::FrameSetup); + .addReg(FramePtrReg, RegState::Kill) + .addImm(-Alignment * ST.getScratchScaleFactor()) + .setMIFlag(MachineInstr::FrameSetup); And->getOperand(3).setIsDead(); // Mark SCC as dead. FuncInfo->setIsStackRealigned(true); } else if ((HasFP = hasFP(MF))) { @@ -1322,9 +1322,9 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, if (HasFP && RoundedSize != 0) { auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) - .addReg(StackPtrReg) - .addImm(RoundedSize * ST.getScratchScaleFactor()) - .setMIFlag(MachineInstr::FrameSetup); + .addReg(StackPtrReg) + .addImm(RoundedSize * ST.getScratchScaleFactor()) + .setMIFlag(MachineInstr::FrameSetup); Add->getOperand(3).setIsDead(); // Mark SCC as dead. }