Skip to content

Commit

Permalink
[AMDGPU] Add amdgpu_gfx calling convention
Browse files Browse the repository at this point in the history
Add a calling convention called amdgpu_gfx for real function calls
within graphics shaders. For the moment, this uses the same calling
convention as other calls in amdgpu, with registers excluded for return
address, stack pointer and stack buffer descriptor.

Differential Revision: https://reviews.llvm.org/D88540
  • Loading branch information
Flakebi committed Nov 9, 2020
1 parent d093401 commit a022b1c
Show file tree
Hide file tree
Showing 30 changed files with 8,325 additions and 102 deletions.
2 changes: 1 addition & 1 deletion lld/test/ELF/lto/amdgcn-oses.ll
Expand Up @@ -33,7 +33,7 @@ define void @_start() {
target triple = "amdgcn-amd-amdpal"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

define void @_start() {
define amdgpu_cs void @_start() {
ret void
}

Expand Down
19 changes: 19 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Expand Up @@ -8116,6 +8116,25 @@ The following table illustrates the required format:
the top 32 bits of the pipeline, so the shader may use the program
counter's top 32 bits.

.. _pal_call-convention:

Call Convention
~~~~~~~~~~~~~~~

For graphics use cases, the calling convention is `amdgpu_gfx`.

.. note::

`amdgpu_gfx` function calls are currently in development and are
subject to major changes.

This calling convention shares most properties with calling non-kernel
functions (see
:ref:`amdgpu-amdhsa-function-call-convention-non-kernel-functions`).
Differences are:

- Currently there are none; differences will be listed here.

Unspecified OS
--------------

Expand Down
3 changes: 3 additions & 0 deletions llvm/include/llvm/IR/CallingConv.h
Expand Up @@ -241,6 +241,9 @@ namespace CallingConv {
/// The remainder matches the regular calling convention.
WASM_EmscriptenInvoke = 99,

/// Calling convention used for AMD graphics targets.
AMDGPU_Gfx = 100,

/// The highest possible calling convention ID. Must be some 2^k - 1.
MaxID = 1023
};
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/AsmParser/LLLexer.cpp
Expand Up @@ -624,6 +624,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(amdgpu_ps);
KEYWORD(amdgpu_cs);
KEYWORD(amdgpu_kernel);
KEYWORD(amdgpu_gfx);
KEYWORD(tailcc);

KEYWORD(cc);
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/AsmParser/LLParser.cpp
Expand Up @@ -2134,6 +2134,7 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
case lltok::kw_hhvm_ccc: CC = CallingConv::HHVM_C; break;
case lltok::kw_cxx_fast_tlscc: CC = CallingConv::CXX_FAST_TLS; break;
case lltok::kw_amdgpu_vs: CC = CallingConv::AMDGPU_VS; break;
case lltok::kw_amdgpu_gfx: CC = CallingConv::AMDGPU_Gfx; break;
case lltok::kw_amdgpu_ls: CC = CallingConv::AMDGPU_LS; break;
case lltok::kw_amdgpu_hs: CC = CallingConv::AMDGPU_HS; break;
case lltok::kw_amdgpu_es: CC = CallingConv::AMDGPU_ES; break;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/AsmParser/LLToken.h
Expand Up @@ -170,6 +170,7 @@ enum Kind {
kw_amdgpu_ps,
kw_amdgpu_cs,
kw_amdgpu_kernel,
kw_amdgpu_gfx,
kw_tailcc,

// Attributes:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/IR/AsmWriter.cpp
Expand Up @@ -399,6 +399,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
case CallingConv::AMDGPU_PS: Out << "amdgpu_ps"; break;
case CallingConv::AMDGPU_CS: Out << "amdgpu_cs"; break;
case CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break;
case CallingConv::AMDGPU_Gfx: Out << "amdgpu_gfx"; break;
}
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Expand Up @@ -456,7 +456,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Info = analyzeResourceUsage(MF);
}

if (STM.isAmdPalOS())
if (STM.isAmdPalOS() && MFI->isEntryFunction())
EmitPALMetadata(MF, CurrentProgramInfo);
else if (!STM.isAmdHsaOS()) {
EmitProgramInfoSI(MF, CurrentProgramInfo);
Expand Down
24 changes: 13 additions & 11 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
Expand Up @@ -460,8 +460,8 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,

CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
const bool IsShader = AMDGPU::isShader(CC);
const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
AMDGPU::isKernel(CC);
const bool IsWaveEnd =
(IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
if (IsWaveEnd) {
B.buildInstr(AMDGPU::S_ENDPGM)
.addImm(0);
Expand Down Expand Up @@ -785,7 +785,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (CC == CallingConv::AMDGPU_KERNEL)
return lowerFormalArgumentsKernel(B, F, VRegs);

const bool IsShader = AMDGPU::isShader(CC);
const bool IsGraphics = AMDGPU::isGraphics(CC);
const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

MachineFunction &MF = B.getMF();
Expand Down Expand Up @@ -826,7 +826,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
const bool InReg = Arg.hasAttribute(Attribute::InReg);

// SGPR arguments to functions not implemented.
if (!IsShader && InReg)
if (!IsGraphics && InReg)
return false;

if (Arg.hasAttribute(Attribute::SwiftSelf) ||
Expand Down Expand Up @@ -937,7 +937,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(

// Start adding system SGPRs.
if (IsEntryFunc) {
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
} else {
CCInfo.AllocateReg(Info->getScratchRSrcReg());
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
Expand Down Expand Up @@ -1131,11 +1131,6 @@ static bool addCallTargetOperands(MachineInstrBuilder &CallInst,

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
return false;
}

if (Info.IsVarArg) {
LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
return false;
Expand All @@ -1149,8 +1144,15 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
MachineRegisterInfo &MRI = MF.getRegInfo();
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
const DataLayout &DL = F.getParent()->getDataLayout();
CallingConv::ID CallConv = F.getCallingConv();

if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
CallConv != CallingConv::AMDGPU_Gfx) {
LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
return false;
}

if (AMDGPU::isShader(F.getCallingConv())) {
if (AMDGPU::isShader(CallConv)) {
LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
return false;
}
Expand Down
72 changes: 70 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
Expand Up @@ -16,7 +16,75 @@ class CCIfExtend<CCAction A>
: CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;

// Calling convention for SI
def CC_SI : CallingConv<[
def CC_SI_Gfx : CallingConv<[
// 0-3 are reserved for the stack buffer descriptor
// 30-31 are reserved for the return address
// 32 is reserved for the stack pointer
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29,
]>>>,

CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
]>>>,

CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
]>;

def RetCC_SI_Gfx : CallingConv<[
// 0-3 are reserved for the stack buffer descriptor
// 32 is reserved for the stack pointer
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
SGPR40, SGPR41, SGPR42, SGPR43
]>>>,

CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31,
VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39,
VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47,
VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55,
VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63,
VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71,
VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79,
VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87,
VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95,
VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103,
VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111,
VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
]>>>,

CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
]>;

def CC_SI_SHADER : CallingConv<[

CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
Expand Down Expand Up @@ -161,7 +229,7 @@ def CC_AMDGPU : CallingConv<[
CCIf<"static_cast<const GCNSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS",
CCDelegateTo<CC_SI>>,
CCDelegateTo<CC_SI_SHADER>>,
CCIf<"static_cast<const GCNSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Expand Up @@ -942,6 +942,8 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::Fast:
case CallingConv::Cold:
return CC_AMDGPU_Func;
case CallingConv::AMDGPU_Gfx:
return CC_SI_Gfx;
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
default:
Expand All @@ -963,6 +965,8 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_LS:
return RetCC_SI_Shader;
case CallingConv::AMDGPU_Gfx:
return RetCC_SI_Gfx;
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::Cold:
Expand Down
27 changes: 1 addition & 26 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Expand Up @@ -829,31 +829,6 @@ int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
}
}

static bool isArgPassedInSGPR(const Argument *A) {
const Function *F = A->getParent();

// Arguments to compute shaders are never a source of divergence.
CallingConv::ID CC = F->getCallingConv();
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
return true;
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_LS:
case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
// For non-compute shaders, SGPR inputs are marked with either inreg.
// Everything else is in VGPRs.
return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg);
default:
// TODO: Should calls support inreg for SGPR inputs?
return false;
}
}

/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
Expand Down Expand Up @@ -910,7 +885,7 @@ bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
if (const Argument *A = dyn_cast<Argument>(V))
return !isArgPassedInSGPR(A);
return !AMDGPU::isArgPassedInSGPR(A);

// Loads from the private and flat address spaces are divergent, because
// threads can execute the load instruction with the same inputs and get
Expand Down
6 changes: 3 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
Expand Up @@ -76,7 +76,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
const GCNSubtarget *ST;
const SITargetLowering *TLI;
AMDGPUTTIImpl CommonTTI;
bool IsGraphicsShader;
bool IsGraphics;
bool HasFP32Denormals;
bool HasFP64FP16Denormals;
unsigned MaxVGPRs;
Expand Down Expand Up @@ -142,7 +142,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
: BaseT(TM, F.getParent()->getDataLayout()),
ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
TLI(ST->getTargetLowering()), CommonTTI(TM, F),
IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
MaxVGPRs(ST->getMaxNumVGPRs(
std::max(ST->getWavesPerEU(F).first,
ST->getWavesPerEUForWorkGroup(
Expand Down Expand Up @@ -222,7 +222,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
unsigned getFlatAddressSpace() const {
// Don't bother running InferAddressSpaces pass on graphics shaders which
// don't use flat addressing.
if (IsGraphicsShader)
if (IsGraphics)
return -1;
return AMDGPUAS::FLAT_ADDRESS;
}
Expand Down

0 comments on commit a022b1c

Please sign in to comment.