Skip to content

Commit

Permalink
AMDGPU: Add support for spilling to a buffer pointed to by user SGPRs
Browse files Browse the repository at this point in the history
Summary:
This lets you select which sort of spilling you want: either the buffer address is taken directly from s[0:1], or it is fetched with a 64-bit load through the pointer in s[0:1].

Patch By: Dave Airlie

Reviewers: nhaehnle, arsenm, tstellarAMD

Reviewed By: arsenm

Subscribers: mareko, llvm-commits, kzhuravl, wdng, yaxunl, tony-tye

Differential Revision: https://reviews.llvm.org/D25428

llvm-svn: 293000
  • Loading branch information
tstellarAMD committed Jan 25, 2017
1 parent e04da5d commit 2f3f985
Show file tree
Hide file tree
Showing 10 changed files with 123 additions and 35 deletions.
4 changes: 4 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Expand Up @@ -100,6 +100,10 @@ def int_amdgcn_dispatch_id :
GCCBuiltin<"__builtin_amdgcn_dispatch_id">,
Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>;

// Takes no arguments and returns an i8 pointer qualified with address
// space 2; marked IntrNoMem. Mapped to the GCC-style builtin
// __builtin_amdgcn_implicit_buffer_ptr.
def int_amdgcn_implicit_buffer_ptr :
GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [], [IntrNoMem]>;

//===----------------------------------------------------------------------===//
// Instruction Intrinsics
//===----------------------------------------------------------------------===//
Expand Down
6 changes: 3 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Expand Up @@ -140,7 +140,7 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
SIProgramInfo KernelInfo;
if (STM.isAmdCodeObjectV2()) {
if (STM.isAmdCodeObjectV2(*MF)) {
getSIProgramInfo(KernelInfo, *MF);
EmitAmdKernelCodeT(*MF, KernelInfo);
}
Expand All @@ -149,7 +149,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
if (MFI->isKernel() && STM.isAmdCodeObjectV2()) {
if (MFI->isKernel() && STM.isAmdCodeObjectV2(*MF)) {
AMDGPUTargetStreamer *TS =
static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
SmallString<128> SymbolName;
Expand Down Expand Up @@ -779,7 +779,7 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,

// FIXME: Should use getKernArgSize
header.kernarg_segment_byte_size =
STM.getKernArgSegmentSize(MFI->getABIArgOffset());
STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
header.wavefront_sgpr_count = KernelInfo.NumSGPR;
header.workitem_vgpr_count = KernelInfo.NumVGPR;
header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
Expand Down
5 changes: 3 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Expand Up @@ -299,8 +299,9 @@ bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
unsigned ImplicitBytes = getImplicitArgNumBytes();
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
unsigned ExplicitArgBytes) const {
unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
if (ImplicitBytes == 0)
return ExplicitArgBytes;

Expand Down
23 changes: 16 additions & 7 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Expand Up @@ -313,22 +313,31 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
return EnableXNACK;
}

bool isAmdCodeObjectV2() const {
return isAmdHsaOS() || isMesa3DOS();
/// \returns true when the target OS is Mesa 3D and the calling convention
/// of \p MF is not one that AMDGPU::isShader() classifies as a shader.
bool isMesaKernel(const MachineFunction &MF) const {
  // Nothing to check for non-Mesa targets.
  if (!isMesa3DOS())
    return false;

  const auto CC = MF.getFunction()->getCallingConv();
  return !AMDGPU::isShader(CC);
}

/// Covers VS/PS/CS graphics shaders.
/// \returns true when the target OS is Mesa 3D and the calling convention
/// of \p MF is one that AMDGPU::isShader() classifies as a shader.
bool isMesaGfxShader(const MachineFunction &MF) const {
  // Nothing to check for non-Mesa targets.
  if (!isMesa3DOS())
    return false;

  const auto CC = MF.getFunction()->getCallingConv();
  return AMDGPU::isShader(CC);
}

/// \returns true if \p MF is treated as using AMD code object v2: always
/// on the AMD HSA OS, otherwise only when isMesaKernel() holds for \p MF.
bool isAmdCodeObjectV2(const MachineFunction &MF) const {
  if (isAmdHsaOS())
    return true;

  return isMesaKernel(MF);
}

/// \brief Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset() const {
return isAmdCodeObjectV2() ? 0 : 36;
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
return isAmdCodeObjectV2(MF) ? 0 : 36;
}

/// \returns the alignment value used for the implicit argument pointer:
/// 8 on the AMD HSA OS and 4 on every other OS.
unsigned getAlignmentForImplicitArgPtr() const {
  if (isAmdHsaOS())
    return 8;

  return 4;
}

unsigned getImplicitArgNumBytes() const {
if (isMesa3DOS())
unsigned getImplicitArgNumBytes(const MachineFunction &MF) const {
if (isMesaKernel(MF))
return 16;
if (isAmdHsaOS() && isOpenCLEnv())
return 32;
Expand Down Expand Up @@ -595,7 +604,7 @@ class SISubtarget final : public AMDGPUSubtarget {
return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
}

unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const;
/// Computes the kernel argument segment size for \p MF from the size of its
/// explicit arguments. When the function has no implicit argument bytes,
/// \p ExplicitArgBytes is returned unchanged.
unsigned getKernArgSegmentSize(const MachineFunction &MF,
                               unsigned ExplicitArgBytes) const;

/// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
Expand Up @@ -1587,7 +1587,7 @@ SDValue R600TargetLowering::LowerFormalArguments(

unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
unsigned PartOffset = VA.getLocMemOffset();
unsigned Offset = Subtarget->getExplicitKernelArgOffset() + VA.getLocMemOffset();
unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset();

MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
SDValue Arg = DAG.getLoad(
Expand Down
58 changes: 46 additions & 12 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Expand Up @@ -237,7 +237,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,


unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
if (ST.isAmdCodeObjectV2()) {
if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) {
PreloadedPrivateBufferReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}
Expand All @@ -255,7 +255,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
}

if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
assert(ST.isAmdCodeObjectV2());
assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF));
MRI.addLiveIn(PreloadedPrivateBufferReg);
MBB.addLiveIn(PreloadedPrivateBufferReg);
}
Expand All @@ -280,6 +280,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,

bool CopyBuffer = ResourceRegUsed &&
PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
ST.isAmdCodeObjectV2(MF) &&
ScratchRsrcReg != PreloadedPrivateBufferReg;

// This needs to be careful of the copying order to avoid overwriting one of
Expand All @@ -303,24 +304,57 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.addReg(PreloadedPrivateBufferReg, RegState::Kill);
}

if (ResourceRegUsed && PreloadedPrivateBufferReg == AMDGPU::NoRegister) {
assert(!ST.isAmdCodeObjectV2());
if (ResourceRegUsed && (ST.isMesaGfxShader(MF) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister))) {
assert(!ST.isAmdCodeObjectV2(MF));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

// Use relocations to get the pointer, and set up the other bits manually.
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
BuildMI(MBB, I, DL, SMovB32, Rsrc0)
.addExternalSymbol("SCRATCH_RSRC_DWORD0")
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);

BuildMI(MBB, I, DL, SMovB32, Rsrc1)
.addExternalSymbol("SCRATCH_RSRC_DWORD1")
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
if (MFI->hasPrivateMemoryInputPtr()) {
unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

BuildMI(MBB, I, DL, Mov64, Rsrc01)
.addReg(PreloadedPrivateBufferReg)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
} else {
const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

PointerType *PtrTy =
PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
AMDGPUAS::CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
auto MMO = MF.getMachineMemOperand(PtrInfo,
MachineMemOperand::MOLoad |
MachineMemOperand::MOInvariant |
MachineMemOperand::MODereferenceable,
0, 0);
BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
.addReg(PreloadedPrivateBufferReg)
.addImm(0) // offset
.addImm(0) // glc
.addMemOperand(MMO)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
}
} else {
unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

BuildMI(MBB, I, DL, SMovB32, Rsrc0)
.addExternalSymbol("SCRATCH_RSRC_DWORD0")
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);

BuildMI(MBB, I, DL, SMovB32, Rsrc1)
.addExternalSymbol("SCRATCH_RSRC_DWORD1")
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);

}

BuildMI(MBB, I, DL, SMovB32, Rsrc2)
.addImm(Rsrc23 & 0xffffffff)
Expand Down
18 changes: 14 additions & 4 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Expand Up @@ -891,14 +891,20 @@ SDValue SITargetLowering::LowerFormalArguments(
if (!AMDGPU::isShader(CallConv)) {
assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
} else {
assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
assert(!Info->hasDispatchPtr() &&
!Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
!Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
!Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
!Info->hasWorkItemIDZ());
}

if (Info->hasPrivateMemoryInputPtr()) {
unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI);
MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass);
CCInfo.AllocateReg(PrivateMemoryPtrReg);
}

// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
if (Info->hasPrivateSegmentBuffer()) {
unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
Expand Down Expand Up @@ -956,7 +962,7 @@ SDValue SITargetLowering::LowerFormalArguments(
if (VA.isMemLoc()) {
VT = Ins[i].VT;
EVT MemVT = VA.getLocVT();
const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) +
VA.getLocMemOffset();
// The first 36 bytes of the input buffer contain information about
// thread group and global sizes.
Expand Down Expand Up @@ -1080,7 +1086,7 @@ SDValue SITargetLowering::LowerFormalArguments(
if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
HasStackObjects = true;

if (ST.isAmdCodeObjectV2()) {
if (ST.isAmdCodeObjectV2(MF)) {
if (HasStackObjects) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
Expand Down Expand Up @@ -2504,9 +2510,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// TODO: Should this propagate fast-math-flags?

switch (IntrinsicID) {
case Intrinsic::amdgcn_implicit_buffer_ptr: {
unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
}
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
if (!Subtarget->isAmdCodeObjectV2()) {
if (!Subtarget->isAmdCodeObjectV2(MF)) {
DiagnosticInfoUnsupported BadIntrin(
*MF.getFunction(), "unsupported hsa intrinsic without hsa target",
DL.getDebugLoc());
Expand Down
15 changes: 13 additions & 2 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
Expand Up @@ -77,7 +77,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
PrivateSegmentWaveByteOffset(false),
WorkItemIDX(false),
WorkItemIDY(false),
WorkItemIDZ(false) {
WorkItemIDZ(false),
PrivateMemoryInputPtr(false) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const Function *F = MF.getFunction();

Expand Down Expand Up @@ -114,7 +115,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (HasStackObjects || MaySpill)
PrivateSegmentWaveByteOffset = true;

if (ST.isAmdCodeObjectV2()) {
if (ST.isAmdCodeObjectV2(MF)) {
if (HasStackObjects || MaySpill)
PrivateSegmentBuffer = true;

Expand All @@ -126,6 +127,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)

if (F->hasFnAttribute("amdgpu-dispatch-id"))
DispatchID = true;
} else if (ST.isMesaGfxShader(MF)) {
if (HasStackObjects || MaySpill)
PrivateMemoryInputPtr = true;
}

// We don't need to worry about accessing spills with flat instructions.
Expand Down Expand Up @@ -182,6 +186,13 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
return FlatScratchInitUserSGPR;
}

/// Claims the next two user SGPRs as a single 64-bit register for the
/// private memory pointer, records that register, and returns it.
unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) {
  // Query the next free user SGPR before the count is bumped below; it
  // becomes the sub0 half of the 64-bit super-register.
  const auto LowSGPR = getNextUserSGPR();
  const unsigned PtrReg = TRI.getMatchingSuperReg(
      LowSGPR, AMDGPU::sub0, &AMDGPU::SReg_64RegClass);

  PrivateMemoryPtrUserSGPR = PtrReg;
  NumUserSGPRs += 2; // A 64-bit pointer occupies two 32-bit SGPRs.
  return PrivateMemoryPtrUserSGPR;
}

SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
MachineFunction *MF,
unsigned FrameIndex,
Expand Down
17 changes: 17 additions & 0 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
Expand Up @@ -87,6 +87,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned ScratchRSrcReg;
unsigned ScratchWaveOffsetReg;

// Input registers for non-HSA ABI
unsigned PrivateMemoryPtrUserSGPR;

// Input registers setup for the HSA ABI.
// User SGPRs in allocation order.
unsigned PrivateSegmentBufferUserSGPR;
Expand Down Expand Up @@ -166,6 +169,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
bool WorkItemIDY : 1;
bool WorkItemIDZ : 1;

// Private memory buffer pointer input.
// Compute shaders receive the buffer address directly in sgpr[0:1];
// other shaders load the 64-bit address indirectly through sgpr[0:1].
bool PrivateMemoryInputPtr : 1;

MCPhysReg getNextUserSGPR() const {
assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
return AMDGPU::SGPR0 + NumUserSGPRs;
Expand Down Expand Up @@ -204,6 +212,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
unsigned addDispatchID(const SIRegisterInfo &TRI);
unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI);

// Add system SGPRs.
unsigned addWorkGroupIDX() {
Expand Down Expand Up @@ -308,6 +317,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
return WorkItemIDZ;
}

/// \returns the PrivateMemoryInputPtr flag, i.e. whether this function
/// takes the private memory buffer pointer as an input.
bool hasPrivateMemoryInputPtr() const {
return PrivateMemoryInputPtr;
}

/// \returns the current value of the NumUserSGPRs counter.
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
Expand Down Expand Up @@ -344,6 +357,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
return QueuePtrUserSGPR;
}

/// \returns the register stored in PrivateMemoryPtrUserSGPR.
/// NOTE(review): presumably only meaningful after addPrivateMemoryPtr() has
/// assigned it -- confirm how the constructor initializes this field.
unsigned getPrivateMemoryPtrUserSGPR() const {
return PrivateMemoryPtrUserSGPR;
}

/// \returns the HasSpilledSGPRs flag.
bool hasSpilledSGPRs() const {
return HasSpilledSGPRs;
}
Expand Down
10 changes: 6 additions & 4 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Expand Up @@ -1108,10 +1108,12 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
assert(ST.isAmdCodeObjectV2() &&
"Non-CodeObjectV2 ABI currently uses relocations");
assert(MFI->hasPrivateSegmentBuffer());
return MFI->PrivateSegmentBufferUserSGPR;
if (ST.isAmdCodeObjectV2(MF)) {
assert(MFI->hasPrivateSegmentBuffer());
return MFI->PrivateSegmentBufferUserSGPR;
}
assert(MFI->hasPrivateMemoryInputPtr());
return MFI->PrivateMemoryPtrUserSGPR;
case SIRegisterInfo::KERNARG_SEGMENT_PTR:
assert(MFI->hasKernargSegmentPtr());
return MFI->KernargSegmentPtrUserSGPR;
Expand Down

0 comments on commit 2f3f985

Please sign in to comment.