Skip to content

Commit

Permalink
AMDGPU: Support non-entry block static sized allocas
Browse files Browse the repository at this point in the history
OpenMP emits these for some reason, so handle them. Assume these use
4096 bytes by default, with a flag to override this. Also change the
related stack assumption for calls to have a flag.
  • Loading branch information
arsenm committed May 27, 2020
1 parent dda8298 commit 5e007fe
Show file tree
Hide file tree
Showing 4 changed files with 362 additions and 2 deletions.
27 changes: 25 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Expand Up @@ -49,6 +49,22 @@ using namespace llvm;
using namespace llvm::AMDGPU;
using namespace llvm::AMDGPU::HSAMD;

// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
// non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
"amdgpu-assume-external-call-stack-size",
cl::desc("Assumed stack use of any external call (in bytes)"),
cl::Hidden,
cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
"amdgpu-assume-dynamic-stack-object-size",
cl::desc("Assumed extra stack use if there are any "
"variable sized objects (in bytes)"),
cl::Hidden,
cl::init(4096));

// This should get the default rounding mode from the kernel. We just set the
// default here, but this could change if the OpenCL rounding mode pragmas are
// used.
Expand Down Expand Up @@ -637,8 +653,13 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Info.UsesFlatScratch = false;
}

Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
Info.PrivateSegmentSize = FrameInfo.getStackSize();

// Assume a big number if there are any unknown sized objects.
Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
if (Info.HasDynamicallySizedStack)
Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

if (MFI->isStackRealigned())
Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

Expand Down Expand Up @@ -907,7 +928,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
MaxVGPR = std::max(MaxVGPR, 23);
MaxAGPR = std::max(MaxAGPR, 23);

CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
CalleeFrameSize = std::max(CalleeFrameSize,
static_cast<uint64_t>(AssumedStackSizeForExternalCall));

Info.UsesVCC = true;
Info.UsesFlatScratch = ST.hasFlatAddressSpace();
Info.HasDynamicallySizedStack = true;
Expand Down
63 changes: 63 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Expand Up @@ -3089,6 +3089,67 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
IsThisReturn ? OutVals[0] : SDValue());
}

// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
// except for applying the wave size scale to the increment amount.
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
SDValue Op, SelectionDAG &DAG) const {
const MachineFunction &MF = DAG.getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

SDLoc dl(Op);
EVT VT = Op.getValueType();
SDValue Tmp1 = Op;
SDValue Tmp2 = Op.getValue(1);
SDValue Tmp3 = Op.getOperand(2);
SDValue Chain = Tmp1.getOperand(0);

Register SPReg = Info->getStackPtrOffsetReg();

// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

SDValue Size = Tmp2.getOperand(1);
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const TargetFrameLowering *TFL = ST.getFrameLowering();
unsigned Opc =
TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
ISD::ADD : ISD::SUB;

SDValue ScaledSize = DAG.getNode(
ISD::SHL, dl, VT, Size,
DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));

unsigned StackAlign = TFL->getStackAlignment();
Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
if (Align > StackAlign)
Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
DAG.getConstant(-(uint64_t)Align, dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
Tmp2 = DAG.getCALLSEQ_END(
Chain, DAG.getIntPtrConstant(0, dl, true),
DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

return DAG.getMergeValues({Tmp1, Tmp2}, dl);
}

SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
// We only handle constant sizes here to allow non-entry block, static sized
// allocas. A truly dynamic value is more difficult to support because we
// don't know if the size value is uniform or not. If the size isn't uniform,
// we would need to do a wave reduction to get the maximum size to know how
// much to increment the uniform stack pointer.
SDValue Size = Op.getOperand(1);
if (isa<ConstantSDNode>(Size))
return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.

return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
}

Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = StringSwitch<Register>(RegName)
Expand Down Expand Up @@ -4305,6 +4366,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
return splitBinaryVectorOp(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);
}
return SDValue();
}
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Expand Up @@ -337,6 +337,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;

SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;

Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;

Expand Down

0 comments on commit 5e007fe

Please sign in to comment.