[AMDGPU] ISel for @llvm.amdgcn.cs.chain intrinsic (#68186)
The @llvm.amdgcn.cs.chain intrinsic is essentially a call. The call
parameters are bundled up into two intrinsic arguments: one for those that
should go in SGPRs (the third intrinsic argument) and one for those that
should go in VGPRs (the fourth intrinsic argument). Both will often be
some kind of aggregate.
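To make the layout concrete, here is a minimal sketch of what such a call might look like in IR. It is not taken from this patch: the callee @retire_vertex, the aggregate types, and the mangled intrinsic suffix are invented, and a wave64 target is assumed (so the EXEC mask is an i64).

; Illustrative only. Callee and caller both use the amdgpu_cs_chain calling
; convention; SGPR arguments are marked inreg, VGPR arguments are not.
declare amdgpu_cs_chain void @retire_vertex(<3 x i32> inreg %sgpr, <4 x i32> %vgpr)

define amdgpu_cs_chain void @shader(<3 x i32> inreg %sgpr, <4 x i32> %vgpr) {
  ; Operands: callee, EXEC mask, SGPR aggregate (inreg), VGPR aggregate, flags.
  call void (ptr, i64, <3 x i32>, <4 x i32>, i32, ...)
      @llvm.amdgcn.cs.chain.p0.i64.v3i32.v4i32(ptr @retire_vertex, i64 -1,
                                               <3 x i32> inreg %sgpr,
                                               <4 x i32> %vgpr, i32 0)
  unreachable
}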

Both instruction selection frameworks have an internal representation
for intrinsics (G_INTRINSIC[_WITH_SIDE_EFFECTS] for GlobalISel,
ISD::INTRINSIC_[VOID|W_CHAIN] for DAGISel), but we can't use it here
because aggregates are dissolved very early during ISel and we'd lose
the inreg information. Therefore, this patch short-circuits both the
IRTranslator and the SelectionDAGBuilder to lower this intrinsic as a call
from the very start. It reuses the existing infrastructure as much as
possible by calling into the code for lowering tail calls.

This has already gone through a few rounds of review on Phabricator:

Differential Revision: https://reviews.llvm.org/D153761
rovka committed Nov 6, 2023
1 parent 838331a commit 7f5d59b
Showing 16 changed files with 2,736 additions and 91 deletions.
3 changes: 3 additions & 0 deletions llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -62,6 +62,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
@@ -2390,6 +2391,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
Info.OrigRet = {Register(), Type::getVoidTy(CI.getContext()), 0};
return CLI->lowerCall(MIRBuilder, Info);
}
case Intrinsic::amdgcn_cs_chain:
return translateCallBase(CI, MIRBuilder);
case Intrinsic::fptrunc_round: {
uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI);

49 changes: 49 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -76,6 +76,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
@@ -7444,6 +7445,54 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
setValue(&I, Val);
return;
}
case Intrinsic::amdgcn_cs_chain: {
assert(I.arg_size() == 5 && "Additional args not supported yet");
assert(cast<ConstantInt>(I.getOperand(4))->isZero() &&
"Non-zero flags not supported yet");

// At this point we don't care if it's amdgpu_cs_chain or
// amdgpu_cs_chain_preserve.
CallingConv::ID CC = CallingConv::AMDGPU_CS_Chain;

Type *RetTy = I.getType();
assert(RetTy->isVoidTy() && "Should not return");

SDValue Callee = getValue(I.getOperand(0));

// We only have 2 actual args: one for the SGPRs and one for the VGPRs.
// We'll also tack the value of the EXEC mask at the end.
TargetLowering::ArgListTy Args;
Args.reserve(3);

for (unsigned Idx : {2, 3, 1}) {
TargetLowering::ArgListEntry Arg;
Arg.Node = getValue(I.getOperand(Idx));
Arg.Ty = I.getOperand(Idx)->getType();
Arg.setAttributes(&I, Idx);
Args.push_back(Arg);
}

assert(Args[0].IsInReg && "SGPR args should be marked inreg");
assert(!Args[1].IsInReg && "VGPR args should not be marked inreg");
Args[2].IsInReg = true; // EXEC should be inreg

TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(getCurSDLoc())
.setChain(getRoot())
.setCallee(CC, RetTy, Callee, std::move(Args))
.setNoReturn(true)
.setTailCall(true)
.setConvergent(I.isConvergent());
CLI.CB = &I;
std::pair<SDValue, SDValue> Result =
lowerInvokable(CLI, /*EHPadBB*/ nullptr);
(void)Result;
assert(!Result.first.getNode() && !Result.second.getNode() &&
"Should've lowered as tail call");

HasTailCall = true;
return;
}
case Intrinsic::ptrmask: {
SDValue Ptr = getValue(I.getOperand(0));
SDValue Mask = getValue(I.getOperand(1));
112 changes: 101 additions & 11 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -961,12 +961,18 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
}

static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
- bool IsTailCall, CallingConv::ID CC) {
- assert(!(IsIndirect && IsTailCall) && "Indirect calls can't be tail calls, "
- "because the address can be divergent");
bool IsTailCall, bool isWave32,
CallingConv::ID CC) {
// For calls to amdgpu_cs_chain functions, the address is known to be uniform.
assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) &&
"Indirect calls can't be tail calls, "
"because the address can be divergent");
if (!IsTailCall)
return AMDGPU::G_SI_CALL;

if (AMDGPU::isChainCC(CC))
return isWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;

return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
AMDGPU::SI_TCRETURN;
}
@@ -1154,14 +1160,20 @@ bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
void AMDGPUCallLowering::handleImplicitCallArguments(
MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
CallingConv::ID CalleeCC,
ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
if (!ST.enableFlatScratch()) {
// Insert copies for the SRD. In the HSA case, this should be an identity
// copy.
auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
FuncInfo.getScratchRSrcReg());
- MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
- CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);

auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC)
? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
: AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;

MIRBuilder.buildCopy(CalleeRSrcReg, ScratchRSrcReg);
CallInst.addReg(CalleeRSrcReg, RegState::Implicit);
}

for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
@@ -1193,7 +1205,8 @@ bool AMDGPUCallLowering::lowerTailCall(
if (!IsSibCall)
CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);

- unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true, CalleeCC);
unsigned Opc =
getCallOpcode(MF, Info.Callee.isReg(), true, ST.isWave32(), CalleeCC);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
if (!addCallTargetOperands(MIB, MIRBuilder, Info))
return false;
@@ -1202,8 +1215,27 @@
// be 0.
MIB.addImm(0);

- // Tell the call which registers are clobbered.
// If this is a chain call, we need to pass in the EXEC mask.
const SIRegisterInfo *TRI = ST.getRegisterInfo();
if (AMDGPU::isChainCC(Info.CallConv)) {
ArgInfo ExecArg = Info.OrigArgs[1];
assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");

if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize()))
return false;

if (auto CI = dyn_cast<ConstantInt>(ExecArg.OrigValue)) {
MIB.addImm(CI->getSExtValue());
} else {
MIB.addReg(ExecArg.Regs[0]);
unsigned Idx = MIB->getNumOperands() - 1;
MIB->getOperand(Idx).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
MIB->getDesc(), MIB->getOperand(Idx), Idx));
}
}

// Tell the call which registers are clobbered.
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
MIB.addRegMask(Mask);

@@ -1257,7 +1289,8 @@ bool AMDGPUCallLowering::lowerTailCall(
// after the ordinary user argument registers.
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

- if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
!AMDGPU::isChainCC(Info.CallConv)) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
return false;
@@ -1273,7 +1306,8 @@
if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
return false;

- handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);
handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
ImplicitArgRegs);

// If we have -tailcallopt, we need to adjust the stack. We'll do the call
// sequence start and end here.
@@ -1307,8 +1341,62 @@ bool AMDGPUCallLowering::lowerTailCall(
return true;
}

/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
ArgInfo Callee = Info.OrigArgs[0];
ArgInfo SGPRArgs = Info.OrigArgs[2];
ArgInfo VGPRArgs = Info.OrigArgs[3];
ArgInfo Flags = Info.OrigArgs[4];

assert(cast<ConstantInt>(Flags.OrigValue)->isZero() &&
"Non-zero flags aren't supported yet.");
assert(Info.OrigArgs.size() == 5 && "Additional args aren't supported yet.");

MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();

// The function to jump to is actually the first argument, so we'll change the
// Callee and other info to match that before using our existing helper.
const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
if (const Function *F = dyn_cast<Function>(CalleeV)) {
Info.Callee = MachineOperand::CreateGA(F, 0);
Info.CallConv = F->getCallingConv();
} else {
assert(Callee.Regs.size() == 1 && "Too many regs for the callee");
Info.Callee = MachineOperand::CreateReg(Callee.Regs[0], false);
Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
// behaves the same here.
}

// The function that we're calling cannot be vararg (only the intrinsic is).
Info.IsVarArg = false;

assert(std::all_of(SGPRArgs.Flags.begin(), SGPRArgs.Flags.end(),
[](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
"SGPR arguments should be marked inreg");
assert(std::none_of(VGPRArgs.Flags.begin(), VGPRArgs.Flags.end(),
[](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
"VGPR arguments should not be marked inreg");

SmallVector<ArgInfo, 8> OutArgs;
splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv);
splitToValueTypes(VGPRArgs, OutArgs, DL, Info.CallConv);

Info.IsMustTailCall = true;
return lowerTailCall(MIRBuilder, Info, OutArgs);
}

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
if (Function *F = Info.CB->getCalledFunction())
if (F->isIntrinsic()) {
assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
"Unexpected intrinsic");
return lowerChainCall(MIRBuilder, Info);
}

if (Info.IsVarArg) {
LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
return false;
@@ -1357,7 +1445,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,

// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
- unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, Info.CallConv);
unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, ST.isWave32(),
Info.CallConv);

auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
MIB.addDef(TRI->getReturnAddressReg(MF));
@@ -1399,7 +1488,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,

const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

- handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
ImplicitArgRegs);

// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getStackSize();
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -75,10 +75,13 @@ class AMDGPUCallLowering final : public CallLowering {
void handleImplicitCallArguments(
MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
const GCNSubtarget &ST, const SIMachineFunctionInfo &MFI,
CallingConv::ID CalleeCC,
ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const;

bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
SmallVectorImpl<ArgInfo> &OutArgs) const;
bool lowerChainCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const;
bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;

1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5212,6 +5212,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CALL)
NODE_NAME_CASE(TC_RETURN)
NODE_NAME_CASE(TC_RETURN_GFX)
NODE_NAME_CASE(TC_RETURN_CHAIN)
NODE_NAME_CASE(TRAP)
NODE_NAME_CASE(RET_GLUE)
NODE_NAME_CASE(WAVE_ADDRESS)
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -389,6 +389,7 @@ enum NodeType : unsigned {
CALL,
TC_RETURN,
TC_RETURN_GFX,
TC_RETURN_CHAIN,
TRAP,

// Masked control flow nodes.
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -94,6 +94,11 @@ def AMDGPUtc_return_gfx: SDNode<"AMDGPUISD::TC_RETURN_GFX", AMDGPUTCReturnTP,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;

def AMDGPUtc_return_chain: SDNode<"AMDGPUISD::TC_RETURN_CHAIN",
SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;

def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
[SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]
