Add action builder for HIP
To support separate compilation and linking, and linking across device IR in
different source files, a new HIP action builder is introduced. It compiles and
links host and device code separately, and embeds the fat binary into the host
image at the host linking stage through a linker script.
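
The fat binary reaches the host link via a driver-generated linker script. A
minimal sketch of that technique follows; the section and symbol names are
illustrative placeholders, not necessarily what the driver actually emits:

  /* Hypothetical GNU ld script: place the device fat binary (wrapped by
     the driver into fatbin.o) into its own section of the host image,
     with a leading symbol the runtime can locate. */
  SECTIONS
  {
    .hip_fatbin :
    {
      __hip_fatbin = .;
      fatbin.o(.data)
    }
  } INSERT BEFORE .data;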

Differential Revision: https://reviews.llvm.org/D46476

llvm-svn: 333483
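
For context, the separate compile/link flow this enables looks roughly like
the following; the file names and the gfx900 arch are illustrative, and the
flags reuse the existing CUDA driver options:

  clang++ -x hip --cuda-gpu-arch=gfx900 -c a.hip -o a.o
  clang++ -x hip --cuda-gpu-arch=gfx900 -c b.hip -o b.o
  clang++ --cuda-gpu-arch=gfx900 a.o b.o -o a.out

Each -c step bundles host object code with device IR; the final step unbundles
them, links the device IR for each GPU arch, and embeds the resulting fat
binary into the host image at link time.
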
yxsamliu committed May 30, 2018
1 parent 6ca999b commit 3af038b
Showing 2 changed files with 428 additions and 252 deletions.
332 changes: 218 additions & 114 deletions clang/lib/Driver/Driver.cpp
@@ -2151,9 +2151,10 @@ class OffloadingActionBuilder final {
}
};

/// CUDA action builder. It injects device code in the host backend
/// action.
class CudaActionBuilder final : public DeviceActionBuilder {
/// Base class for CUDA/HIP action builder. It injects device code in
/// the host backend action.
class CudaActionBuilderBase : public DeviceActionBuilder {
protected:
/// Flags to signal if the user requested host-only or device-only
/// compilation.
bool CompileHostOnly = false;
@@ -2170,115 +2171,11 @@ class OffloadingActionBuilder final {

/// Flag that is set to true if this builder acted on the current input.
bool IsActive = false;

public:
CudaActionBuilder(Compilation &C, DerivedArgList &Args,
const Driver::InputList &Inputs)
: DeviceActionBuilder(C, Args, Inputs, Action::OFK_Cuda) {}

ActionBuilderReturnCode
getDeviceDependences(OffloadAction::DeviceDependences &DA,
phases::ID CurPhase, phases::ID FinalPhase,
PhasesTy &Phases) override {
if (!IsActive)
return ABRT_Inactive;

// If we don't have more CUDA actions, we don't have any dependences to
// create for the host.
if (CudaDeviceActions.empty())
return ABRT_Success;

assert(CudaDeviceActions.size() == GpuArchList.size() &&
"Expecting one action per GPU architecture.");
assert(!CompileHostOnly &&
"Not expecting CUDA actions in host-only compilation.");

// If we are generating code for the device or we are in a backend phase,
// we attempt to generate the fat binary. We compile each arch to ptx and
// assemble to cubin, then feed the cubin *and* the ptx into a device
// "link" action, which uses fatbinary to combine these cubins into one
// fatbin. The fatbin is then an input to the host action if not in
// device-only mode.
if (CompileDeviceOnly || CurPhase == phases::Backend) {
ActionList DeviceActions;
for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
// Produce the device action from the current phase up to the assemble
// phase.
for (auto Ph : Phases) {
// Skip the phases that were already dealt with.
if (Ph < CurPhase)
continue;
// We have to be consistent with the host final phase.
if (Ph > FinalPhase)
break;

CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction(
C, Args, Ph, CudaDeviceActions[I], Action::OFK_Cuda);

if (Ph == phases::Assemble)
break;
}

// If we didn't reach the assemble phase, we can't generate the fat
// binary. We also don't need the fat binary in device-only mode.
if (!isa<AssembleJobAction>(CudaDeviceActions[I]) ||
CompileDeviceOnly)
continue;

Action *AssembleAction = CudaDeviceActions[I];
assert(AssembleAction->getType() == types::TY_Object);
assert(AssembleAction->getInputs().size() == 1);

Action *BackendAction = AssembleAction->getInputs()[0];
assert(BackendAction->getType() == types::TY_PP_Asm);

for (auto &A : {AssembleAction, BackendAction}) {
OffloadAction::DeviceDependences DDep;
DDep.add(*A, *ToolChains.front(), CudaArchToString(GpuArchList[I]),
Action::OFK_Cuda);
DeviceActions.push_back(
C.MakeAction<OffloadAction>(DDep, A->getType()));
}
}

// We generate the fat binary if we have device input actions.
if (!DeviceActions.empty()) {
CudaFatBinary =
C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);

if (!CompileDeviceOnly) {
DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
Action::OFK_Cuda);
// Clear the fat binary; it is already a dependence of a host
// action.
CudaFatBinary = nullptr;
}

// Remove the CUDA actions as they are already connected to a host
// action or fat binary.
CudaDeviceActions.clear();
}

// We avoid creating a host action in device-only mode.
return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
} else if (CurPhase > phases::Backend) {
// If we are past the backend phase and still have a device action, we
// don't have to do anything as this action is already a device
// top-level action.
return ABRT_Success;
}

assert(CurPhase < phases::Backend && "Generating single CUDA "
"instructions should only occur "
"before the backend phase!");

// By default, we produce an action for each device arch.
for (Action *&A : CudaDeviceActions)
A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A);

return ABRT_Success;
}
CudaActionBuilderBase(Compilation &C, DerivedArgList &Args,
const Driver::InputList &Inputs,
Action::OffloadKind OFKind)
: DeviceActionBuilder(C, Args, Inputs, OFKind) {}

ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override {
// While generating code for CUDA, we only depend on the host input action
@@ -2317,6 +2214,17 @@ class OffloadingActionBuilder final {
return ABRT_Success;
}

// If this is an unbundling action, use it as is for each CUDA toolchain.
if (auto *UA = dyn_cast<OffloadUnbundlingJobAction>(HostAction)) {
CudaDeviceActions.clear();
for (auto Arch : GpuArchList) {
CudaDeviceActions.push_back(UA);
UA->registerDependentActionInfo(ToolChains[0], CudaArchToString(Arch),
AssociatedOffloadKind);
}
return ABRT_Success;
}

return IsActive ? ABRT_Success : ABRT_Inactive;
}

@@ -2325,7 +2233,7 @@ class OffloadingActionBuilder final {
auto AddTopLevel = [&](Action *A, CudaArch BoundArch) {
OffloadAction::DeviceDependences Dep;
Dep.add(*A, *ToolChains.front(), CudaArchToString(BoundArch),
Action::OFK_Cuda);
AssociatedOffloadKind);
AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
};

@@ -2354,8 +2262,17 @@ }
}

bool initialize() override {
assert(AssociatedOffloadKind == Action::OFK_Cuda ||
AssociatedOffloadKind == Action::OFK_HIP);

// We don't need to support CUDA.
if (!C.hasOffloadToolChain<Action::OFK_Cuda>())
if (AssociatedOffloadKind == Action::OFK_Cuda &&
!C.hasOffloadToolChain<Action::OFK_Cuda>())
return false;

// We don't need to support HIP.
if (AssociatedOffloadKind == Action::OFK_HIP &&
!C.hasOffloadToolChain<Action::OFK_HIP>())
return false;

const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
@@ -2370,7 +2287,10 @@ class OffloadingActionBuilder final {
return true;
}

ToolChains.push_back(C.getSingleOffloadToolChain<Action::OFK_Cuda>());
ToolChains.push_back(
AssociatedOffloadKind == Action::OFK_Cuda
? C.getSingleOffloadToolChain<Action::OFK_Cuda>()
: C.getSingleOffloadToolChain<Action::OFK_HIP>());

Arg *PartialCompilationArg = Args.getLastArg(
options::OPT_cuda_host_only, options::OPT_cuda_device_only,
@@ -2423,6 +2343,187 @@ class OffloadingActionBuilder final {
}
};

/// \brief CUDA action builder. It injects device code in the host backend
/// action.
class CudaActionBuilder final : public CudaActionBuilderBase {
public:
CudaActionBuilder(Compilation &C, DerivedArgList &Args,
const Driver::InputList &Inputs)
: CudaActionBuilderBase(C, Args, Inputs, Action::OFK_Cuda) {}

ActionBuilderReturnCode
getDeviceDependences(OffloadAction::DeviceDependences &DA,
phases::ID CurPhase, phases::ID FinalPhase,
PhasesTy &Phases) override {
if (!IsActive)
return ABRT_Inactive;

// If we don't have more CUDA actions, we don't have any dependences to
// create for the host.
if (CudaDeviceActions.empty())
return ABRT_Success;

assert(CudaDeviceActions.size() == GpuArchList.size() &&
"Expecting one action per GPU architecture.");
assert(!CompileHostOnly &&
"Not expecting CUDA actions in host-only compilation.");

// If we are generating code for the device or we are in a backend phase,
// we attempt to generate the fat binary. We compile each arch to ptx and
// assemble to cubin, then feed the cubin *and* the ptx into a device
// "link" action, which uses fatbinary to combine these cubins into one
// fatbin. The fatbin is then an input to the host action if not in
// device-only mode.
if (CompileDeviceOnly || CurPhase == phases::Backend) {
ActionList DeviceActions;
for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
// Produce the device action from the current phase up to the assemble
// phase.
for (auto Ph : Phases) {
// Skip the phases that were already dealt with.
if (Ph < CurPhase)
continue;
// We have to be consistent with the host final phase.
if (Ph > FinalPhase)
break;

CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction(
C, Args, Ph, CudaDeviceActions[I], Action::OFK_Cuda);

if (Ph == phases::Assemble)
break;
}

// If we didn't reach the assemble phase, we can't generate the fat
// binary. We also don't need the fat binary in device-only mode.
if (!isa<AssembleJobAction>(CudaDeviceActions[I]) ||
CompileDeviceOnly)
continue;

Action *AssembleAction = CudaDeviceActions[I];
assert(AssembleAction->getType() == types::TY_Object);
assert(AssembleAction->getInputs().size() == 1);

Action *BackendAction = AssembleAction->getInputs()[0];
assert(BackendAction->getType() == types::TY_PP_Asm);

for (auto &A : {AssembleAction, BackendAction}) {
OffloadAction::DeviceDependences DDep;
DDep.add(*A, *ToolChains.front(), CudaArchToString(GpuArchList[I]),
Action::OFK_Cuda);
DeviceActions.push_back(
C.MakeAction<OffloadAction>(DDep, A->getType()));
}
}

// We generate the fat binary if we have device input actions.
if (!DeviceActions.empty()) {
CudaFatBinary =
C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);

if (!CompileDeviceOnly) {
DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
Action::OFK_Cuda);
// Clear the fat binary; it is already a dependence of a host
// action.
CudaFatBinary = nullptr;
}

// Remove the CUDA actions as they are already connected to a host
// action or fat binary.
CudaDeviceActions.clear();
}

// We avoid creating a host action in device-only mode.
return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
} else if (CurPhase > phases::Backend) {
// If we are past the backend phase and still have a device action, we
// don't have to do anything as this action is already a device
// top-level action.
return ABRT_Success;
}

assert(CurPhase < phases::Backend && "Generating single CUDA "
"instructions should only occur "
"before the backend phase!");

// By default, we produce an action for each device arch.
for (Action *&A : CudaDeviceActions)
A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A);

return ABRT_Success;
}
};

/// \brief HIP action builder. It injects device code in the host backend
/// action.
class HIPActionBuilder final : public CudaActionBuilderBase {
/// The linker inputs obtained for each device arch.
SmallVector<ActionList, 8> DeviceLinkerInputs;

public:
HIPActionBuilder(Compilation &C, DerivedArgList &Args,
const Driver::InputList &Inputs)
: CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {}

bool canUseBundlerUnbundler() const override { return true; }

ActionBuilderReturnCode
getDeviceDependences(OffloadAction::DeviceDependences &DA,
phases::ID CurPhase, phases::ID FinalPhase,
PhasesTy &Phases) override {
// amdgcn does not support linking of object files; therefore we skip the
// backend and assemble phases to output LLVM IR.
if (CudaDeviceActions.empty() || CurPhase == phases::Backend ||
CurPhase == phases::Assemble)
return ABRT_Success;

assert((CurPhase == phases::Link ||
CudaDeviceActions.size() == GpuArchList.size()) &&
"Expecting one action per GPU architecture.");
assert(!CompileHostOnly &&
"Not expecting CUDA actions in host-only compilation.");

// Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
// This happens to each device action originating from each input file.
// Later on, device actions in DeviceLinkerInputs are used to create
// device link actions in appendLinkDependences and the created device
// link actions are passed to the offload action as device dependence.
if (CurPhase == phases::Link) {
DeviceLinkerInputs.resize(CudaDeviceActions.size());
auto LI = DeviceLinkerInputs.begin();
for (auto *A : CudaDeviceActions) {
LI->push_back(A);
++LI;
}

// We will pass the device action as a host dependence, so we don't
// need to do anything else with them.
CudaDeviceActions.clear();
return ABRT_Success;
}

// By default, we produce an action for each device arch.
for (Action *&A : CudaDeviceActions)
A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A,
AssociatedOffloadKind);

return ABRT_Success;
}

void appendLinkDependences(OffloadAction::DeviceDependences &DA) override {
// Append a new link action for each device.
unsigned I = 0;
for (auto &LI : DeviceLinkerInputs) {
auto *DeviceLinkAction =
C.MakeAction<LinkJobAction>(LI, types::TY_Image);
DA.add(*DeviceLinkAction, *ToolChains[0],
CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
++I;
}
}
};

/// OpenMP action builder. The host bitcode is passed to the device frontend
/// and all the device linked images are passed to the host link phase.
class OpenMPActionBuilder final : public DeviceActionBuilder {
@@ -2589,6 +2690,9 @@ class OffloadingActionBuilder final {
// Create a specialized builder for CUDA.
SpecializedBuilders.push_back(new CudaActionBuilder(C, Args, Inputs));

// Create a specialized builder for HIP.
SpecializedBuilders.push_back(new HIPActionBuilder(C, Args, Inputs));

// Create a specialized builder for OpenMP.
SpecializedBuilders.push_back(new OpenMPActionBuilder(C, Args, Inputs));

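For readers skimming the diff, here is a compressed sketch of the class layout
this change introduces; the types and phases below are simplified stand-ins,
not the actual Clang driver classes:

  #include <memory>
  #include <vector>

  enum Phase { Compile, Backend, Assemble, Link };

  // Shared CUDA/HIP state (GPU arch list, device actions, host-only and
  // device-only flags) lives in the base; only per-phase logic differs.
  class CudaActionBuilderBaseSketch {
  public:
    virtual ~CudaActionBuilderBaseSketch() = default;
    virtual void getDeviceDependences(Phase CurPhase) = 0;
  };

  class CudaBuilderSketch final : public CudaActionBuilderBaseSketch {
  public:
    void getDeviceDependences(Phase) override {
      // CUDA: at the Backend phase, assemble each arch and combine the
      // PTX/cubin pairs into a single fat binary for the host action.
    }
  };

  class HIPBuilderSketch final : public CudaActionBuilderBaseSketch {
  public:
    void getDeviceDependences(Phase) override {
      // HIP: skip Backend/Assemble (amdgcn links IR, not objects), save
      // per-arch IR as linker inputs, and emit device links at Link.
    }
  };

  int main() {
    // The offloading builder registers one specialized builder per
    // offload kind and walks all of them at each compilation phase.
    std::vector<std::unique_ptr<CudaActionBuilderBaseSketch>> Builders;
    Builders.push_back(std::make_unique<CudaBuilderSketch>());
    Builders.push_back(std::make_unique<HIPBuilderSketch>());
    for (auto &B : Builders)
      B->getDeviceDependences(Link);
  }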
