Add action builder for HIP
To support separate compilation and linking, and linking across device IR in
different source files, a new HIP action builder is introduced. It compiles and
links host and device code separately, and embeds the fat binary into the host
image at the host linking stage through a linker script.
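
The fat binary reaches the host link via a driver-generated linker script. A
minimal sketch of that technique follows; the section and symbol names are
illustrative placeholders, not necessarily what the driver actually emits:

  /* Hypothetical GNU ld script: place the device fat binary (wrapped by
     the driver into fatbin.o) into its own section of the host image,
     with a leading symbol the runtime can locate. */
  SECTIONS
  {
    .hip_fatbin :
    {
      __hip_fatbin = .;
      fatbin.o(.data)
    }
  } INSERT BEFORE .data;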

Differential Revision: https://reviews.llvm.org/D46476

llvm-svn: 333483
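
For context, the separate compile/link flow this enables looks roughly like
the following; the file names and the gfx900 arch are illustrative, and the
flags reuse the existing CUDA driver options:

  clang++ -x hip --cuda-gpu-arch=gfx900 -c a.hip -o a.o
  clang++ -x hip --cuda-gpu-arch=gfx900 -c b.hip -o b.o
  clang++ --cuda-gpu-arch=gfx900 a.o b.o -o a.out

Each -c step bundles host object code with device IR; the final step unbundles
them, links the device IR for each GPU arch, and embeds the resulting fat
binary into the host image at link time.
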
yxsamliu committed May 30, 2018
1 parent 6ca999b commit 3af038b
Showing 2 changed files with 428 additions and 252 deletions.
332 changes: 218 additions & 114 deletions clang/lib/Driver/Driver.cpp
@@ -2151,9 +2151,10 @@ class OffloadingActionBuilder final {
}
};

/// CUDA action builder. It injects device code in the host backend
/// action.
class CudaActionBuilder final : public DeviceActionBuilder {
/// Base class for CUDA/HIP action builder. It injects device code in
/// the host backend action.
class CudaActionBuilderBase : public DeviceActionBuilder {
protected:
/// Flags to signal if the user requested host-only or device-only
/// compilation.
bool CompileHostOnly = false;
@@ -2170,115 +2171,11 @@ class OffloadingActionBuilder final {

/// Flag that is set to true if this builder acted on the current input.
bool IsActive = false;

public:
CudaActionBuilder(Compilation &C, DerivedArgList &Args,
const Driver::InputList &Inputs)
: DeviceActionBuilder(C, Args, Inputs, Action::OFK_Cuda) {}

ActionBuilderReturnCode
getDeviceDependences(OffloadAction::DeviceDependences &DA,
phases::ID CurPhase, phases::ID FinalPhase,
PhasesTy &Phases) override {
if (!IsActive)
return ABRT_Inactive;

// If we don't have more CUDA actions, we don't have any dependences to
// create for the host.
if (CudaDeviceActions.empty())
return ABRT_Success;

assert(CudaDeviceActions.size() == GpuArchList.size() &&
"Expecting one action per GPU architecture.");
assert(!CompileHostOnly &&
"Not expecting CUDA actions in host-only compilation.");

// If we are generating code for the device or we are in a backend phase,
// we attempt to generate the fat binary. We compile each arch to ptx and
// assemble to cubin, then feed the cubin *and* the ptx into a device
// "link" action, which uses fatbinary to combine these cubins into one
// fatbin. The fatbin is then an input to the host action if not in
// device-only mode.
if (CompileDeviceOnly || CurPhase == phases::Backend) {
ActionList DeviceActions;
for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
// Produce the device action from the current phase up to the assemble
// phase.
for (auto Ph : Phases) {
// Skip the phases that were already dealt with.
if (Ph < CurPhase)
continue;
// We have to be consistent with the host final phase.
if (Ph > FinalPhase)
break;

CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction(
C, Args, Ph, CudaDeviceActions[I], Action::OFK_Cuda);

if (Ph == phases::Assemble)
break;
}

// If we didn't reach the assemble phase, we can't generate the fat
// binary. We also don't need the fat binary in device-only mode.
if (!isa<AssembleJobAction>(CudaDeviceActions[I]) ||
CompileDeviceOnly)
continue;

Action *AssembleAction = CudaDeviceActions[I];
assert(AssembleAction->getType() == types::TY_Object);
assert(AssembleAction->getInputs().size() == 1);

Action *BackendAction = AssembleAction->getInputs()[0];
assert(BackendAction->getType() == types::TY_PP_Asm);

for (auto &A : {AssembleAction, BackendAction}) {
OffloadAction::DeviceDependences DDep;
DDep.add(*A, *ToolChains.front(), CudaArchToString(GpuArchList[I]),
Action::OFK_Cuda);
DeviceActions.push_back(
C.MakeAction<OffloadAction>(DDep, A->getType()));
}
}

// We generate the fat binary if we have device input actions.
if (!DeviceActions.empty()) {
CudaFatBinary =
C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);

if (!CompileDeviceOnly) {
DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
Action::OFK_Cuda);
// Clear the fat binary; it is already a dependence of a host
// action.
CudaFatBinary = nullptr;
}

// Remove the CUDA actions as they are already connected to a host
// action or fat binary.
CudaDeviceActions.clear();
}

// We avoid creating a host action in device-only mode.
return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
} else if (CurPhase > phases::Backend) {
// If we are past the backend phase and still have a device action, we
// don't have to do anything as this action is already a device
// top-level action.
return ABRT_Success;
}

assert(CurPhase < phases::Backend && "Generating single CUDA "
"instructions should only occur "
"before the backend phase!");

// By default, we produce an action for each device arch.
for (Action *&A : CudaDeviceActions)
A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A);

return ABRT_Success;
}
CudaActionBuilderBase(Compilation &C, DerivedArgList &Args,
const Driver::InputList &Inputs,
Action::OffloadKind OFKind)
: DeviceActionBuilder(C, Args, Inputs, OFKind) {}

ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override {
// While generating code for CUDA, we only depend on the host input action
@@ -2317,6 +2214,17 @@ class OffloadingActionBuilder final {
return ABRT_Success;
}

// If this is an unbundling action, use it as is for each CUDA toolchain.
if (auto *UA = dyn_cast<OffloadUnbundlingJobAction>(HostAction)) {
CudaDeviceActions.clear();
for (auto Arch : GpuArchList) {
CudaDeviceActions.push_back(UA);
UA->registerDependentActionInfo(ToolChains[0], CudaArchToString(Arch),
AssociatedOffloadKind);
}
return ABRT_Success;
}

return IsActive ? ABRT_Success : ABRT_Inactive;
}

@@ -2325,7 +2233,7 @@ class OffloadingActionBuilder final {
auto AddTopLevel = [&](Action *A, CudaArch BoundArch) {
OffloadAction::DeviceDependences Dep;
Dep.add(*A, *ToolChains.front(), CudaArchToString(BoundArch),
Action::OFK_Cuda);
AssociatedOffloadKind);
AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
};

@@ -2354,8 +2262,17 @@ }
}

bool initialize() override {
assert(AssociatedOffloadKind == Action::OFK_Cuda ||
AssociatedOffloadKind == Action::OFK_HIP);

// We don't need to support CUDA.
if (!C.hasOffloadToolChain<Action::OFK_Cuda>())
if (AssociatedOffloadKind == Action::OFK_Cuda &&
!C.hasOffloadToolChain<Action::OFK_Cuda>())
return false;

// We don't need to support HIP.
if (AssociatedOffloadKind == Action::OFK_HIP &&
!C.hasOffloadToolChain<Action::OFK_HIP>())
return false;

const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
@@ -2370,7 +2287,10 @@ class OffloadingActionBuilder final {
return true;
}

ToolChains.push_back(C.getSingleOffloadToolChain<Action::OFK_Cuda>());
ToolChains.push_back(
AssociatedOffloadKind == Action::OFK_Cuda
? C.getSingleOffloadToolChain<Action::OFK_Cuda>()
: C.getSingleOffloadToolChain<Action::OFK_HIP>());

Arg *PartialCompilationArg = Args.getLastArg(
options::OPT_cuda_host_only, options::OPT_cuda_device_only,
@@ -2423,6 +2343,187 @@ class OffloadingActionBuilder final {
}
};

/// \brief CUDA action builder. It injects device code in the host backend
/// action.
class CudaActionBuilder final : public CudaActionBuilderBase {
public:
CudaActionBuilder(Compilation &C, DerivedArgList &Args,
const Driver::InputList &Inputs)
: CudaActionBuilderBase(C, Args, Inputs, Action::OFK_Cuda) {}

ActionBuilderReturnCode
getDeviceDependences(OffloadAction::DeviceDependences &DA,
phases::ID CurPhase, phases::ID FinalPhase,
PhasesTy &Phases) override {
if (!IsActive)
return ABRT_Inactive;

// If we don't have more CUDA actions, we don't have any dependences to
// create for the host.
if (CudaDeviceActions.empty())
return ABRT_Success;

assert(CudaDeviceActions.size() == GpuArchList.size() &&
"Expecting one action per GPU architecture.");
assert(!CompileHostOnly &&
"Not expecting CUDA actions in host-only compilation.");

// If we are generating code for the device or we are in a backend phase,
// we attempt to generate the fat binary. We compile each arch to ptx and
// assemble to cubin, then feed the cubin *and* the ptx into a device
// "link" action, which uses fatbinary to combine these cubins into one
// fatbin. The fatbin is then an input to the host action if not in
// device-only mode.
if (CompileDeviceOnly || CurPhase == phases::Backend) {
ActionList DeviceActions;
for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
// Produce the device action from the current phase up to the assemble
// phase.
for (auto Ph : Phases) {
// Skip the phases that were already dealt with.
if (Ph < CurPhase)
continue;
// We have to be consistent with the host final phase.
if (Ph > FinalPhase)
break;

CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction(
C, Args, Ph, CudaDeviceActions[I], Action::OFK_Cuda);

if (Ph == phases::Assemble)
break;
}

// If we didn't reach the assemble phase, we can't generate the fat
// binary. We also don't need the fat binary in device-only mode.
if (!isa<AssembleJobAction>(CudaDeviceActions[I]) ||
CompileDeviceOnly)
continue;

Action *AssembleAction = CudaDeviceActions[I];
assert(AssembleAction->getType() == types::TY_Object);
assert(AssembleAction->getInputs().size() == 1);

Action *BackendAction = AssembleAction->getInputs()[0];
assert(BackendAction->getType() == types::TY_PP_Asm);

for (auto &A : {AssembleAction, BackendAction}) {
OffloadAction::DeviceDependences DDep;
DDep.add(*A, *ToolChains.front(), CudaArchToString(GpuArchList[I]),
Action::OFK_Cuda);
DeviceActions.push_back(
C.MakeAction<OffloadAction>(DDep, A->getType()));
}
}

// We generate the fat binary if we have device input actions.
if (!DeviceActions.empty()) {
CudaFatBinary =
C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);

if (!CompileDeviceOnly) {
DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
Action::OFK_Cuda);
// Clear the fat binary; it is already a dependence of a host
// action.
CudaFatBinary = nullptr;
}

// Remove the CUDA actions as they are already connected to a host
// action or fat binary.
CudaDeviceActions.clear();
}

// We avoid creating a host action in device-only mode.
return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
} else if (CurPhase > phases::Backend) {
// If we are past the backend phase and still have a device action, we
// don't have to do anything as this action is already a device
// top-level action.
return ABRT_Success;
}

assert(CurPhase < phases::Backend && "Generating single CUDA "
"instructions should only occur "
"before the backend phase!");

// By default, we produce an action for each device arch.
for (Action *&A : CudaDeviceActions)
A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A);

return ABRT_Success;
}
};

/// \brief HIP action builder. It injects device code in the host backend
/// action.
class HIPActionBuilder final : public CudaActionBuilderBase {
/// The linker inputs obtained for each device arch.
SmallVector<ActionList, 8> DeviceLinkerInputs;

public:
HIPActionBuilder(Compilation &C, DerivedArgList &Args,
const Driver::InputList &Inputs)
: CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {}

bool canUseBundlerUnbundler() const override { return true; }

ActionBuilderReturnCode
getDeviceDependences(OffloadAction::DeviceDependences &DA,
phases::ID CurPhase, phases::ID FinalPhase,
PhasesTy &Phases) override {
// amdgcn does not support linking of object files; therefore we skip the
// backend and assemble phases to output LLVM IR.
if (CudaDeviceActions.empty() || CurPhase == phases::Backend ||
CurPhase == phases::Assemble)
return ABRT_Success;

assert((CurPhase == phases::Link ||
CudaDeviceActions.size() == GpuArchList.size()) &&
"Expecting one action per GPU architecture.");
assert(!CompileHostOnly &&
"Not expecting CUDA actions in host-only compilation.");

// Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
// This happens to each device action originating from each input file.
// Later on, device actions in DeviceLinkerInputs are used to create
// device link actions in appendLinkDependences and the created device
// link actions are passed to the offload action as device dependence.
if (CurPhase == phases::Link) {
DeviceLinkerInputs.resize(CudaDeviceActions.size());
auto LI = DeviceLinkerInputs.begin();
for (auto *A : CudaDeviceActions) {
LI->push_back(A);
++LI;
}

// We will pass the device action as a host dependence, so we don't
// need to do anything else with them.
CudaDeviceActions.clear();
return ABRT_Success;
}

// By default, we produce an action for each device arch.
for (Action *&A : CudaDeviceActions)
A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A,
AssociatedOffloadKind);

return ABRT_Success;
}

void appendLinkDependences(OffloadAction::DeviceDependences &DA) override {
// Append a new link action for each device.
unsigned I = 0;
for (auto &LI : DeviceLinkerInputs) {
auto *DeviceLinkAction =
C.MakeAction<LinkJobAction>(LI, types::TY_Image);
DA.add(*DeviceLinkAction, *ToolChains[0],
CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
++I;
}
}
};

/// OpenMP action builder. The host bitcode is passed to the device frontend
/// and all the device linked images are passed to the host link phase.
class OpenMPActionBuilder final : public DeviceActionBuilder {
@@ -2589,6 +2690,9 @@ class OffloadingActionBuilder final {
// Create a specialized builder for CUDA.
SpecializedBuilders.push_back(new CudaActionBuilder(C, Args, Inputs));

// Create a specialized builder for HIP.
SpecializedBuilders.push_back(new HIPActionBuilder(C, Args, Inputs));

// Create a specialized builder for OpenMP.
SpecializedBuilders.push_back(new OpenMPActionBuilder(C, Args, Inputs));

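For readers skimming the diff, here is a compressed sketch of the class layout
this change introduces; the types and phases below are simplified stand-ins,
not the actual Clang driver classes:

  #include <memory>
  #include <vector>

  enum Phase { Compile, Backend, Assemble, Link };

  // Shared CUDA/HIP state (GPU arch list, device actions, host-only and
  // device-only flags) lives in the base; only per-phase logic differs.
  class CudaActionBuilderBaseSketch {
  public:
    virtual ~CudaActionBuilderBaseSketch() = default;
    virtual void getDeviceDependences(Phase CurPhase) = 0;
  };

  class CudaBuilderSketch final : public CudaActionBuilderBaseSketch {
  public:
    void getDeviceDependences(Phase) override {
      // CUDA: at the Backend phase, assemble each arch and combine the
      // PTX/cubin pairs into a single fat binary for the host action.
    }
  };

  class HIPBuilderSketch final : public CudaActionBuilderBaseSketch {
  public:
    void getDeviceDependences(Phase) override {
      // HIP: skip Backend/Assemble (amdgcn links IR, not objects), save
      // per-arch IR as linker inputs, and emit device links at Link.
    }
  };

  int main() {
    // The offloading builder registers one specialized builder per
    // offload kind and walks all of them at each compilation phase.
    std::vector<std::unique_ptr<CudaActionBuilderBaseSketch>> Builders;
    Builders.push_back(std::make_unique<CudaBuilderSketch>());
    Builders.push_back(std::make_unique<HIPBuilderSketch>());
    for (auto &B : Builders)
      B->getDeviceDependences(Link);
  }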
