Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "AMDGPUSubtarget.h"
#include "GCNSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
Expand Down Expand Up @@ -128,15 +129,20 @@ bool AMDGPUMachineLevelInliner::runOnMachineFunction(MachineFunction &MF) {
if (!MFI.hasCalls() && !MFI.hasTailCall())
return false;

MaxInlinedCalleeStackSize = 0;
HasInlinedVarSizedStack = false;

// Collect calls to inline.
SmallVector<MachineInstr *, 4> CallsToInline;
const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();

size_t CallsFound = 0;
for (auto &MBB : MF) {
for (auto &MI : MBB) {
if (!MI.isCall())
continue;

CallsFound++;
const MachineOperand *CalleeOp =
TII->getNamedOperand(MI, AMDGPU::OpName::callee);
if (CalleeOp && CalleeOp->isGlobal()) {
Expand All @@ -156,6 +162,11 @@ bool AMDGPUMachineLevelInliner::runOnMachineFunction(MachineFunction &MF) {
}
}

// Reset HasCalls if we're about to inline all of them. This will be updated
// further during inlining if any of the callees introduces its own calls.
// FIXME: HasTailCall!
MFI.setHasCalls(CallsFound != CallsToInline.size());

// Perform the actual inlining.
for (MachineInstr *CallMI : CallsToInline) {
const MachineOperand *CalleeOp =
Expand All @@ -176,6 +187,14 @@ bool AMDGPUMachineLevelInliner::runOnMachineFunction(MachineFunction &MF) {
Changed = true;
}

if (Changed) {
if (MaxInlinedCalleeStackSize != 0)
createCalleeStackObject(MFI);

if (HasInlinedVarSizedStack)
MFI.CreateVariableSizedObject(Align(1), /*Alloca=*/nullptr);
}

return Changed;
}

Expand All @@ -184,6 +203,9 @@ void AMDGPUMachineLevelInliner::inlineMachineFunction(MachineFunction *CallerMF,
MachineFunction *CalleeMF,
const SIInstrInfo *TII) {

// TODO: update SIMachineFunctionInfo (e.g. Occupancy)
updateCallerFrameInfo(CallerMF->getFrameInfo(), *CalleeMF);

MachineBasicBlock *CallMBB = CallMI->getParent();
MachineBasicBlock *ContinuationMBB =
CallMBB->splitAt(*CallMI, /*UpdateLiveIns=*/true);
Expand Down Expand Up @@ -287,6 +309,55 @@ void AMDGPUMachineLevelInliner::cleanupAfterInlining(
MI->eraseFromParent();
}

void AMDGPUMachineLevelInliner::updateCallerFrameInfo(
    MachineFrameInfo &CallerMFI, const MachineFunction &CalleeMF) {
  const MachineFrameInfo &CalleeMFI = CalleeMF.getFrameInfo();
  const GCNSubtarget &ST = CalleeMF.getSubtarget<GCNSubtarget>();
  const TargetRegisterInfo &TRI = *ST.getRegisterInfo();

  // Mirror the prologue logic: a callee that realigns its stack may consume up
  // to MaxAlign additional bytes of padding beyond the reported stack size.
  uint64_t CalleeStackSize = CalleeMFI.getStackSize();
  if (TRI.hasStackRealignment(CalleeMF))
    CalleeStackSize += CalleeMFI.getMaxAlign().value();
  const uint64_t ScaledCalleeStackSize =
      CalleeStackSize * ST.getScratchScaleFactor();

  // Inlined callees never execute concurrently inside the caller, so it is
  // sufficient to reserve space for the single largest callee stack.
  if (ScaledCalleeStackSize > MaxInlinedCalleeStackSize)
    MaxInlinedCalleeStackSize = ScaledCalleeStackSize;

  // Remember whether any inlined callee performs dynamic stack allocation.
  HasInlinedVarSizedStack |= CalleeMFI.hasVarSizedObjects();

  // Propagate each frame property that becomes true for the caller once the
  // callee's body lives inside it; a flag already set on the caller stays set.
  if (CalleeMFI.hasCalls())
    CallerMFI.setHasCalls(true);
  if (CalleeMFI.hasTailCall())
    CallerMFI.setHasTailCall(true);
  if (CalleeMFI.adjustsStack())
    CallerMFI.setAdjustsStack(true);
  if (CalleeMFI.isFrameAddressTaken())
    CallerMFI.setFrameAddressIsTaken(true);
  if (CalleeMFI.isReturnAddressTaken())
    CallerMFI.setReturnAddressIsTaken(true);
  if (CalleeMFI.hasVAStart())
    CallerMFI.setHasVAStart(true);
  if (CalleeMFI.hasMustTailInVarArgFunc())
    CallerMFI.setHasMustTailInVarArgFunc(true);
  if (CalleeMFI.hasOpaqueSPAdjustment())
    CallerMFI.setHasOpaqueSPAdjustment(true);
  if (CalleeMFI.hasCopyImplyingStackAdjustment())
    CallerMFI.setHasCopyImplyingStackAdjustment(true);
}

void AMDGPUMachineLevelInliner::createCalleeStackObject(
    MachineFrameInfo &CallerMFI) {
  // Create a stack object representing the maximum callee stack space
  // Reserve a single fixed slot at the current end of the caller's frame,
  // sized for the biggest inlined callee stack seen by updateCallerFrameInfo.
  // Only one inlined callee is live at a time, so the slot is shared.
  //
  // NOTE(review): MaxInlinedCalleeStackSize was already multiplied by
  // getScratchScaleFactor(), while MachineFrameInfo stack sizes appear to be
  // kept unscaled (SIFrameLowering multiplies getStackSize() by the scale
  // factor when materializing SP/FP) -- confirm the callee portion is not
  // scaled twice here.
  uint64_t CallerStackSize = CallerMFI.getStackSize();
  int CalleeStackIdx =
      CallerMFI.CreateStackObject(MaxInlinedCalleeStackSize, Align(1),
                                  /*isSpillSlot=*/false);
  // Place the shared slot immediately after the caller's existing objects and
  // grow the recorded frame size to cover it.
  CallerMFI.setObjectOffset(CalleeStackIdx, CallerStackSize);
  CallerMFI.setStackSize(CallerStackSize + MaxInlinedCalleeStackSize);
}

/// Factory for the AMDGPU machine-level inliner pass. The caller (typically
/// the pass pipeline) takes ownership of the returned pass object.
FunctionPass *llvm::createAMDGPUMachineLevelInlinerPass() {
  return new AMDGPUMachineLevelInliner();
}
Expand Down
16 changes: 16 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

namespace llvm {

class GCNSubtarget;
class SIInstrInfo;

class AMDGPUMachineLevelInliner : public MachineFunctionPass {
Expand All @@ -52,6 +53,21 @@ class AMDGPUMachineLevelInliner : public MachineFunctionPass {

void cleanupAfterInlining(MachineFunction *CallerMF, MachineInstr *CallMI,
const SIInstrInfo *TII) const;

void updateCallerFrameInfo(MachineFrameInfo &CallerMFI,
const MachineFunction &CalleeMF);

/// Create a stack object representing the stacks of all the inlined callees.
/// Its size will be large enough to accommodate the callee with the largest
/// stack.
void createCalleeStackObject(MachineFrameInfo &CallerMFI);

/// The maximum stack size among all inlined callees (including any padding
/// required to ensure proper alignment).
uint64_t MaxInlinedCalleeStackSize = 0;

/// Whether any inlined callee has variable-sized stack objects.
bool HasInlinedVarSizedStack = false;
};

} // end namespace llvm
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1057,6 +1057,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// dynamic realignment in common cases.
  // Fixed 16-byte stack alignment for all GCN subtargets.
  Align getStackAlignment() const { return Align(16); }

unsigned getScratchScaleFactor() const {
return enableFlatScratch() ? 1 : getWavefrontSize();
}

  // Always opt in to the MachineScheduler on GCN subtargets.
  bool enableMachineScheduler() const override {
    return true;
  }
Expand Down
24 changes: 10 additions & 14 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -590,10 +590,6 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
return ScratchRsrcReg;
}

static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
Expand Down Expand Up @@ -693,7 +689,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST);
unsigned Offset = FrameInfo.getStackSize() * ST.getScratchScaleFactor();
if (!mayReserveScratchForCWSR(MF)) {
if (hasFP(MF)) {
Register FPReg = MFI->getFrameOffsetReg();
Expand Down Expand Up @@ -1231,7 +1227,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
assert(StackPtrReg != AMDGPU::SP_REG);

BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
.addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
.addImm(MFI.getStackSize() * ST.getScratchScaleFactor());
}
}

Expand Down Expand Up @@ -1292,12 +1288,12 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// s_and_b32 s33, s33, 0b111...0000
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
.addReg(StackPtrReg)
.addImm((Alignment - 1) * getScratchScaleFactor(ST))
.addImm((Alignment - 1) * ST.getScratchScaleFactor())
.setMIFlag(MachineInstr::FrameSetup);
auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
.addReg(FramePtrReg, RegState::Kill)
.addImm(-Alignment * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameSetup);
.addReg(FramePtrReg, RegState::Kill)
.addImm(-Alignment * ST.getScratchScaleFactor())
.setMIFlag(MachineInstr::FrameSetup);
And->getOperand(3).setIsDead(); // Mark SCC as dead.
FuncInfo->setIsStackRealigned(true);
} else if ((HasFP = hasFP(MF))) {
Expand Down Expand Up @@ -1326,9 +1322,9 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,

if (HasFP && RoundedSize != 0) {
auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
.addReg(StackPtrReg)
.addImm(RoundedSize * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameSetup);
.addReg(StackPtrReg)
.addImm(RoundedSize * ST.getScratchScaleFactor())
.setMIFlag(MachineInstr::FrameSetup);
Add->getOperand(3).setIsDead(); // Mark SCC as dead.
}

Expand Down Expand Up @@ -2137,7 +2133,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Register SPReg = MFI->getStackPtrOffsetReg();

Amount *= getScratchScaleFactor(ST);
Amount *= ST.getScratchScaleFactor();
if (IsDestroy)
Amount = -Amount;
auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
Expand Down
Loading
Loading