diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 19da0ad29cd84..dfc5ab904fe9d 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -87,7 +87,6 @@ function(_get_hermetic_test_compile_options output_var) -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}) elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) list(APPEND compile_options - "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false" -Wno-multi-gpu --cuda-path=${LIBC_CUDA_ROOT} -nogpulib -march=${LIBC_GPU_TARGET_ARCHITECTURE} -fno-use-cxa-atexit) endif() @@ -637,6 +636,7 @@ function(add_integration_test test_name) # makes `add_custom_target` construct the correct command and execute it. set(test_cmd ${INTEGRATION_TEST_ENV} + $<$:LIBOMPTARGET_STACK_SIZE=3072> $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${INTEGRATION_TEST_LOADER_ARGS} @@ -790,8 +790,7 @@ function(add_libc_hermetic test_name) if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) target_link_options(${fq_build_target_name} PRIVATE ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu - -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto - "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static + -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -nostdlib -static "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}") elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) target_link_options(${fq_build_target_name} PRIVATE @@ -859,6 +858,7 @@ function(add_libc_hermetic test_name) string(REPLACE " " ";" test_cmd "${test_cmd_parsed}") else() set(test_cmd ${HERMETIC_TEST_ENV} + $<$:LIBOMPTARGET_STACK_SIZE=3072> $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS} $ ${HERMETIC_TEST_ARGS}) endif() diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp index 48f095d924931..e876629e6d0e1 100644 --- a/libc/startup/gpu/amdgpu/start.cpp +++ b/libc/startup/gpu/amdgpu/start.cpp @@ -13,6 +13,9 @@ #include "src/stdlib/atexit.h" #include "src/stdlib/exit.h" +// TODO: Merge this and the NVPTX start files once the common `device_kernel` +// attribute correctly implies `amdgpu_kernel`. + extern "C" int main(int argc, char **argv, char **envp); extern "C" void __cxa_finalize(void *dso); @@ -21,45 +24,18 @@ namespace LIBC_NAMESPACE_DECL { // FIXME: Factor this out into common logic so we don't need to stub it here. void teardown_main_tls() {} -// FIXME: Touch this symbol to force this to be linked in statically. 
-volatile void *dummy = &LIBC_NAMESPACE::rpc::client; - DataEnvironment app; -extern "C" uintptr_t __init_array_start[]; -extern "C" uintptr_t __init_array_end[]; -extern "C" uintptr_t __fini_array_start[]; -extern "C" uintptr_t __fini_array_end[]; - -using InitCallback = void(int, char **, char **); -using FiniCallback = void(void); - -static void call_init_array_callbacks(int argc, char **argv, char **env) { - size_t init_array_size = __init_array_end - __init_array_start; - for (size_t i = 0; i < init_array_size; ++i) - reinterpret_cast(__init_array_start[i])(argc, argv, env); -} - -static void call_fini_array_callbacks() { - size_t fini_array_size = __fini_array_end - __fini_array_start; - for (size_t i = fini_array_size; i > 0; --i) - reinterpret_cast(__fini_array_start[i - 1])(); -} - } // namespace LIBC_NAMESPACE_DECL extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel, clang::amdgpu_flat_work_group_size(1, 1), clang::amdgpu_max_num_work_groups(1)]] void -_begin(int argc, char **argv, char **env) { +_begin(int, char **, char **env) { + // The LLVM offloading runtime will automatically call any present global + // constructors and destructors so we defer that handling. __atomic_store_n(&LIBC_NAMESPACE::app.env_ptr, reinterpret_cast(env), __ATOMIC_RELAXED); - // We want the fini array callbacks to be run after other atexit - // callbacks are run. So, we register them before running the init - // array callbacks as they can potentially register their own atexit - // callbacks. - LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks); - LIBC_NAMESPACE::call_init_array_callbacks(argc, argv, env); } extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp index ce8f5bbb6d4eb..822930e22bc26 100644 --- a/libc/startup/gpu/nvptx/start.cpp +++ b/libc/startup/gpu/nvptx/start.cpp @@ -23,48 +23,14 @@ DataEnvironment app; // FIXME: Factor this out into common logic so we don't need to stub it here. void teardown_main_tls() {} -// FIXME: Touch this symbol to force this to be linked in statically. -volatile void *dummy = &LIBC_NAMESPACE::rpc::client; - -extern "C" { -// Nvidia's 'nvlink' linker does not provide these symbols. We instead need -// to manually create them and update the globals in the loader implememtation. -uintptr_t *__init_array_start [[gnu::visibility("protected")]]; -uintptr_t *__init_array_end [[gnu::visibility("protected")]]; -uintptr_t *__fini_array_start [[gnu::visibility("protected")]]; -uintptr_t *__fini_array_end [[gnu::visibility("protected")]]; -} - -// Nvidia requires that the signature of the function pointers match. This means -// we cannot support the extended constructor arguments. 
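For concreteness, a self-contained sketch (not part of the patch; names are illustrative) of the two constructor signatures this comment contrasts — the extended GNU form the AMDGPU path could run versus the bare form that nvlink's symbols allow:

```cpp
#include <cstdio>

using ExtendedInit = void(int, char **, char **); // what the AMDGPU path ran
using PlainInit = void();                         // all that nvlink supports

static void extended_ctor(int argc, char **, char **) {
  std::printf("extended ctor sees argc=%d\n", argc);
}
static void plain_ctor() { std::printf("plain ctor sees nothing\n"); }

int main(int argc, char **argv) {
  ExtendedInit *extended = extended_ctor;
  PlainInit *plain = plain_ctor;
  extended(argc, argv, nullptr); // can consume the program arguments
  plain();                       // the signature carries no information
  return 0;
}
```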
-using InitCallback = void(void); -using FiniCallback = void(void); - -static void call_init_array_callbacks(int, char **, char **) { - size_t init_array_size = __init_array_end - __init_array_start; - for (size_t i = 0; i < init_array_size; ++i) - reinterpret_cast(__init_array_start[i])(); -} - -static void call_fini_array_callbacks() { - size_t fini_array_size = __fini_array_end - __fini_array_start; - for (size_t i = fini_array_size; i > 0; --i) - reinterpret_cast(__fini_array_start[i - 1])(); -} - } // namespace LIBC_NAMESPACE_DECL extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void -_begin(int argc, char **argv, char **env) { +_begin(int, char **, char **env) { + // The LLVM offloading runtime will automatically call any present global + // constructors and destructors so we defer that handling. __atomic_store_n(&LIBC_NAMESPACE::app.env_ptr, reinterpret_cast(env), __ATOMIC_RELAXED); - - // We want the fini array callbacks to be run after other atexit - // callbacks are run. So, we register them before running the init - // array callbacks as they can potentially register their own atexit - // callbacks. - LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks); - LIBC_NAMESPACE::call_init_array_callbacks(argc, argv, env); } extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void diff --git a/llvm/tools/CMakeLists.txt b/llvm/tools/CMakeLists.txt index 729797aa43f0b..b9c5a79849ec8 100644 --- a/llvm/tools/CMakeLists.txt +++ b/llvm/tools/CMakeLists.txt @@ -9,10 +9,6 @@ # traversing each directory. create_llvm_tool_options() -if(NOT LLVM_COMPILER_IS_GCC_COMPATIBLE OR NOT LLVM_LIBC_GPU_BUILD) - set(LLVM_TOOL_LLVM_GPU_LOADER_BUILD OFF) -endif() - if(NOT LLVM_BUILD_LLVM_DYLIB AND NOT LLVM_BUILD_LLVM_C_DYLIB) set(LLVM_TOOL_LLVM_SHLIB_BUILD Off) endif() diff --git a/llvm/tools/llvm-gpu-loader/CMakeLists.txt b/llvm/tools/llvm-gpu-loader/CMakeLists.txt index b35a702476ada..de276635e2713 100644 --- a/llvm/tools/llvm-gpu-loader/CMakeLists.txt +++ b/llvm/tools/llvm-gpu-loader/CMakeLists.txt @@ -1,46 +1,12 @@ set(LLVM_LINK_COMPONENTS - BinaryFormat - Object Option Support - FrontendOffloading TargetParser ) add_llvm_tool(llvm-gpu-loader llvm-gpu-loader.cpp - # TODO: We intentionally split this currently due to statically linking the - # GPU runtimes. Dynamically load the dependencies, possibly using the - # LLVM offloading API when it is complete. - PARTIAL_SOURCES_INTENDED - DEPENDS intrinsics_gen ) - -# Locate the RPC server handling interface. -include(FindLibcCommonUtils) -target_link_libraries(llvm-gpu-loader PUBLIC llvm-libc-common-utilities) - -# Check for HSA support for targeting AMD GPUs. -find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm) -if(hsa-runtime64_FOUND) - target_sources(llvm-gpu-loader PRIVATE amdhsa.cpp) - target_compile_definitions(llvm-gpu-loader PRIVATE AMDHSA_SUPPORT) - target_link_libraries(llvm-gpu-loader PRIVATE hsa-runtime64::hsa-runtime64) - - # Compatibility with the old amdhsa-loader name. - add_llvm_tool_symlink(amdhsa-loader llvm-gpu-loader) -endif() - -# Check for CUDA support for targeting NVIDIA GPUs. -find_package(CUDAToolkit 11.2 QUIET) -if(CUDAToolkit_FOUND) - target_sources(llvm-gpu-loader PRIVATE nvptx.cpp) - target_compile_definitions(llvm-gpu-loader PRIVATE NVPTX_SUPPORT) - target_link_libraries(llvm-gpu-loader PRIVATE CUDA::cuda_driver) - - # Compatibility with the old nvptx-loader name. 
- add_llvm_tool_symlink(nvptx-loader llvm-gpu-loader) -endif() diff --git a/llvm/tools/llvm-gpu-loader/amdhsa.cpp b/llvm/tools/llvm-gpu-loader/amdhsa.cpp deleted file mode 100644 index 5715058d8cfac..0000000000000 --- a/llvm/tools/llvm-gpu-loader/amdhsa.cpp +++ /dev/null @@ -1,594 +0,0 @@ -//===-- Loader Implementation for AMDHSA devices --------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file impelements a simple loader to run images supporting the AMDHSA -// architecture. The file launches the '_start' kernel which should be provided -// by the device application start code and call ultimately call the 'main' -// function. -// -//===----------------------------------------------------------------------===// - -#include "llvm-gpu-loader.h" -#include "server.h" - -#include "hsa/hsa.h" -#include "hsa/hsa_ext_amd.h" - -#include "llvm/Frontend/Offloading/Utility.h" - -#include -#include -#include -#include -#include -#include -#include - -// The implicit arguments of COV5 AMDGPU kernels. -struct implicit_args_t { - uint32_t grid_size_x; - uint32_t grid_size_y; - uint32_t grid_size_z; - uint16_t workgroup_size_x; - uint16_t workgroup_size_y; - uint16_t workgroup_size_z; - uint8_t Unused0[46]; - uint16_t grid_dims; - uint8_t Unused1[190]; -}; - -/// Print the error code and exit if \p code indicates an error. -static void handle_error_impl(const char *file, int32_t line, - hsa_status_t code) { - if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK) - return; - - const char *desc; - if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS) - desc = "Unknown error"; - fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc); - exit(EXIT_FAILURE); -} - -/// Generic interface for iterating using the HSA callbacks. -template -hsa_status_t iterate(func_ty func, callback_ty cb) { - auto l = [](elem_ty elem, void *data) -> hsa_status_t { - callback_ty *unwrapped = static_cast(data); - return (*unwrapped)(elem); - }; - return func(l, static_cast(&cb)); -} - -/// Generic interface for iterating using the HSA callbacks. -template -hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) { - auto l = [](elem_ty elem, void *data) -> hsa_status_t { - callback_ty *unwrapped = static_cast(data); - return (*unwrapped)(elem); - }; - return func(func_arg, l, static_cast(&cb)); -} - -/// Iterate through all availible agents. -template -hsa_status_t iterate_agents(callback_ty callback) { - return iterate(hsa_iterate_agents, callback); -} - -/// Iterate through all availible memory pools. -template -hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) { - return iterate(hsa_amd_agent_iterate_memory_pools, - agent, cb); -} - -template -hsa_status_t get_agent(hsa_agent_t *output_agent) { - // Find the first agent with a matching device type. - auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t { - hsa_device_type_t type; - hsa_status_t status = - hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type); - if (status != HSA_STATUS_SUCCESS) - return status; - - if (type == flag) { - // Ensure that a GPU agent supports kernel dispatch packets. 
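The iterate() helpers deleted above smuggle a stateful C++ lambda through a C-style callback's void * user-data slot. A runtime-free sketch of that pattern, with iterate_elements standing in for the HSA iterator:

```cpp
#include <cstdio>

using c_callback_t = int (*)(int elem, void *data);

// Stand-in for an HSA-style iterator that only accepts C function pointers.
static int iterate_elements(c_callback_t cb, void *data) {
  for (int elem = 0; elem < 4; ++elem)
    if (int status = cb(elem, data))
      return status;
  return 0;
}

template <typename Callback> int iterate(Callback cb) {
  // A captureless lambda converts to a plain function pointer; the real
  // callback state travels through the void * parameter instead.
  auto thunk = +[](int elem, void *data) -> int {
    return (*static_cast<Callback *>(data))(elem);
  };
  return iterate_elements(thunk, static_cast<void *>(&cb));
}

int main() {
  int sum = 0;
  iterate([&](int elem) {
    sum += elem;
    return 0; // zero means "keep iterating"
  });
  std::printf("sum = %d\n", sum); // sum = 6
  return 0;
}
```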
- if (type == HSA_DEVICE_TYPE_GPU) { - hsa_agent_feature_t features; - status = - hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features); - if (status != HSA_STATUS_SUCCESS) - return status; - if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH) - *output_agent = hsa_agent; - } else { - *output_agent = hsa_agent; - } - return HSA_STATUS_INFO_BREAK; - } - return HSA_STATUS_SUCCESS; - }; - - return iterate_agents(cb); -} - -void print_kernel_resources(const char *kernel_name) { - fprintf(stderr, "Kernel resources on AMDGPU is not supported yet.\n"); -} - -/// Retrieve a global memory pool with a \p flag from the agent. -template -hsa_status_t get_agent_memory_pool(hsa_agent_t agent, - hsa_amd_memory_pool_t *output_pool) { - auto cb = [&](hsa_amd_memory_pool_t memory_pool) { - uint32_t flags; - hsa_amd_segment_t segment; - if (auto err = hsa_amd_memory_pool_get_info( - memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment)) - return err; - if (auto err = hsa_amd_memory_pool_get_info( - memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags)) - return err; - - if (segment != HSA_AMD_SEGMENT_GLOBAL) - return HSA_STATUS_SUCCESS; - - if (flags & flag) - *output_pool = memory_pool; - - return HSA_STATUS_SUCCESS; - }; - return iterate_agent_memory_pools(agent, cb); -} - -template -hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, - hsa_amd_memory_pool_t kernargs_pool, - hsa_amd_memory_pool_t coarsegrained_pool, - hsa_queue_t *queue, rpc::Server &server, - const LaunchParameters ¶ms, - const char *kernel_name, args_t kernel_args, - uint32_t wavefront_size, bool print_resource_usage) { - // Look up the kernel in the loaded executable. - hsa_executable_symbol_t symbol; - if (hsa_status_t err = hsa_executable_get_symbol_by_name( - executable, kernel_name, &dev_agent, &symbol)) - return err; - - // Retrieve different properties of the kernel symbol used for launch. - uint64_t kernel; - uint32_t args_size; - uint32_t group_size; - uint32_t private_size; - bool dynamic_stack; - - std::pair symbol_infos[] = { - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel}, - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size}, - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size}, - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack}, - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}}; - - for (auto &[info, value] : symbol_infos) - if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value)) - return err; - - // Allocate space for the kernel arguments on the host and allow the GPU agent - // to access it. - void *args; - if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size, - /*flags=*/0, &args)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args); - - // Initialize all the arguments (explicit and implicit) to zero, then set the - // explicit arguments to the values created above. - std::memset(args, 0, args_size); - std::memcpy(args, &kernel_args, std::is_empty_v ? 0 : sizeof(args_t)); - - // Initialize the necessary implicit arguments to the proper values. 
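The zero-then-copy sequence above follows the kernarg convention: the runtime reports one buffer size covering both explicit and implicit arguments. A host-only sketch of that packing (the 256-byte size is an assumed stand-in for whatever the executable symbol reports):

```cpp
#include <cstddef>
#include <cstdio>
#include <cstring>

struct start_args_t { int argc; void *argv; void *envp; void *ret; };

int main() {
  // Assume the runtime reported a 256-byte kernarg segment.
  constexpr std::size_t args_size = 256;
  alignas(16) unsigned char buffer[args_size];

  start_args_t explicit_args = {1, nullptr, nullptr, nullptr};
  std::memset(buffer, 0, args_size); // implicit arguments start as zero
  std::memcpy(buffer, &explicit_args, sizeof(explicit_args));

  // The loader then patches grid/workgroup sizes into the implicit region
  // that begins at sizeof(start_args_t).
  std::printf("explicit: %zu bytes, total: %zu bytes\n",
              sizeof(explicit_args), args_size);
  return 0;
}
```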
- int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) + - (params.num_blocks_z * params.num_threads_z != 1); - implicit_args_t *implicit_args = reinterpret_cast( - reinterpret_cast(args) + sizeof(args_t)); - implicit_args->grid_dims = dims; - implicit_args->grid_size_x = params.num_blocks_x; - implicit_args->grid_size_y = params.num_blocks_y; - implicit_args->grid_size_z = params.num_blocks_z; - implicit_args->workgroup_size_x = params.num_threads_x; - implicit_args->workgroup_size_y = params.num_threads_y; - implicit_args->workgroup_size_z = params.num_threads_z; - - // Obtain a packet from the queue. - uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1); - while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size) - ; - - const uint32_t mask = queue->size - 1; - hsa_kernel_dispatch_packet_t *packet = - static_cast(queue->base_address) + - (packet_id & mask); - - // Set up the packet for exeuction on the device. We currently only launch - // with one thread on the device, forcing the rest of the wavefront to be - // masked off. - uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - packet->workgroup_size_x = params.num_threads_x; - packet->workgroup_size_y = params.num_threads_y; - packet->workgroup_size_z = params.num_threads_z; - packet->reserved0 = 0; - packet->grid_size_x = params.num_blocks_x * params.num_threads_x; - packet->grid_size_y = params.num_blocks_y * params.num_threads_y; - packet->grid_size_z = params.num_blocks_z * params.num_threads_z; - packet->private_segment_size = - dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size; - packet->group_segment_size = group_size; - packet->kernel_object = kernel; - packet->kernarg_address = args; - packet->reserved2 = 0; - // Create a signal to indicate when this packet has been completed. - if (hsa_status_t err = - hsa_signal_create(1, 0, nullptr, &packet->completion_signal)) - handle_error(err); - - if (print_resource_usage) - print_kernel_resources(kernel_name); - - // Initialize the packet header and set the doorbell signal to begin execution - // by the HSA runtime. - uint16_t header = - 1u << HSA_PACKET_HEADER_BARRIER | - (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); - uint32_t header_word = header | (setup << 16u); - __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE); - hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); - - std::atomic finished = false; - std::thread server_thread( - [](std::atomic *finished, rpc::Server *server, - uint32_t wavefront_size, hsa_agent_t dev_agent, - hsa_amd_memory_pool_t coarsegrained_pool) { - // Register RPC callbacks for the malloc and free functions on HSA. 
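The header | (setup << 16) store above publishes the AQL packet in a single 32-bit atomic write. A self-contained sketch of that bit packing; the bit positions are restated from the HSA packet-header layout rather than taken from hsa.h, so treat them as illustrative:

```cpp
#include <cstdint>
#include <cstdio>

// Bit positions as in the HSA spec (restated here so the example builds
// without hsa.h; the real code uses the hsa_packet_header_t enums).
constexpr unsigned HEADER_TYPE = 0;       // bits [7:0]
constexpr unsigned HEADER_BARRIER = 8;    // bit 8
constexpr unsigned HEADER_SCACQUIRE = 9;  // bits [10:9]
constexpr unsigned HEADER_SCRELEASE = 11; // bits [12:11]
constexpr unsigned TYPE_KERNEL_DISPATCH = 2;
constexpr unsigned FENCE_SCOPE_SYSTEM = 2;

int main() {
  uint16_t setup = 3; // three grid dimensions in the setup field's low bits
  uint16_t header = (1u << HEADER_BARRIER) |
                    (TYPE_KERNEL_DISPATCH << HEADER_TYPE) |
                    (FENCE_SCOPE_SYSTEM << HEADER_SCACQUIRE) |
                    (FENCE_SCOPE_SYSTEM << HEADER_SCRELEASE);
  // Header and setup share one 32-bit word so the packet becomes valid for
  // the packet processor in a single atomic store.
  uint32_t header_word = header | (uint32_t(setup) << 16u);
  std::printf("header word = 0x%08x\n", header_word);
  return 0;
}
```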
- auto malloc_handler = [&](size_t size) -> void * { - void *dev_ptr = nullptr; - if (hsa_amd_memory_pool_allocate(coarsegrained_pool, size, - /*flags=*/0, &dev_ptr)) - dev_ptr = nullptr; - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr); - return dev_ptr; - }; - - auto free_handler = [](void *ptr) -> void { - if (hsa_status_t err = - hsa_amd_memory_pool_free(reinterpret_cast(ptr))) - handle_error(err); - }; - - uint32_t index = 0; - while (!*finished) { - if (wavefront_size == 32) - index = - handle_server<32>(*server, index, malloc_handler, free_handler); - else - index = - handle_server<64>(*server, index, malloc_handler, free_handler); - } - }, - &finished, &server, wavefront_size, dev_agent, coarsegrained_pool); - - // Wait until the kernel has completed execution on the device. Periodically - // check the RPC client for work to be performed on the server. - while (hsa_signal_wait_scacquire(packet->completion_signal, - HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED) != 0) - ; - - finished = true; - if (server_thread.joinable()) - server_thread.join(); - - // Destroy the resources acquired to launch the kernel and return. - if (hsa_status_t err = hsa_amd_memory_pool_free(args)) - handle_error(err); - if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal)) - handle_error(err); - - return HSA_STATUS_SUCCESS; -} - -/// Copies data from the source agent to the destination agent. The source -/// memory must first be pinned explicitly or allocated via HSA. -static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent, - const void *src, hsa_agent_t src_agent, - uint64_t size) { - // Create a memory signal to copy information between the host and device. - hsa_signal_t memory_signal; - if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal)) - return err; - - if (hsa_status_t err = hsa_amd_memory_async_copy( - dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal)) - return err; - - while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0, - UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) - ; - - if (hsa_status_t err = hsa_signal_destroy(memory_signal)) - return err; - - return HSA_STATUS_SUCCESS; -} - -int load_amdhsa(int argc, const char **argv, const char **envp, void *image, - size_t size, const LaunchParameters ¶ms, - bool print_resource_usage) { - // Initialize the HSA runtime used to communicate with the device. - if (hsa_status_t err = hsa_init()) - handle_error(err); - - // Register a callback when the device encounters a memory fault. - if (hsa_status_t err = hsa_amd_register_system_event_handler( - [](const hsa_amd_event_t *event, void *) -> hsa_status_t { - if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) - return HSA_STATUS_ERROR; - return HSA_STATUS_SUCCESS; - }, - nullptr)) - handle_error(err); - - // Obtain a single agent for the device and host to use the HSA memory model. - hsa_agent_t dev_agent; - hsa_agent_t host_agent; - if (hsa_status_t err = get_agent(&dev_agent)) - handle_error(err); - if (hsa_status_t err = get_agent(&host_agent)) - handle_error(err); - - // Load the code object's ISA information and executable data segments. 
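Both deleted loaders use the same host-side shutdown handshake: a worker thread polls for RPC work until an atomic flag flips after the kernel's completion signal. A runtime-free sketch:

```cpp
#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

int main() {
  std::atomic<bool> finished = false;

  // Stand-in for the RPC server thread.
  std::thread server([&] {
    while (!finished)
      std::this_thread::yield(); // the real loop services RPC opcodes here
  });

  // Stand-in for hsa_signal_wait_scacquire() / cuStreamSynchronize().
  std::this_thread::sleep_for(std::chrono::milliseconds(10));

  finished = true; // only set once the device can produce no more RPC work
  if (server.joinable())
    server.join();
  std::printf("server drained\n");
  return 0;
}
```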
- hsa_code_object_reader_t reader; - if (hsa_status_t err = - hsa_code_object_reader_create_from_memory(image, size, &reader)) - handle_error(err); - - hsa_executable_t executable; - if (hsa_status_t err = hsa_executable_create_alt( - HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", - &executable)) - handle_error(err); - - hsa_loaded_code_object_t object; - if (hsa_status_t err = hsa_executable_load_agent_code_object( - executable, dev_agent, reader, "", &object)) - handle_error(err); - - // No modifications to the executable are allowed after this point. - if (hsa_status_t err = hsa_executable_freeze(executable, "")) - handle_error(err); - - // Check the validity of the loaded executable. If the agents ISA features do - // not match the executable's code object it will fail here. - uint32_t result; - if (hsa_status_t err = hsa_executable_validate(executable, &result)) - handle_error(err); - if (result) - handle_error(HSA_STATUS_ERROR); - - if (hsa_status_t err = hsa_code_object_reader_destroy(reader)) - handle_error(err); - - // Obtain memory pools to exchange data between the host and the device. The - // fine-grained pool acts as pinned memory on the host for DMA transfers to - // the device, the coarse-grained pool is for allocations directly on the - // device, and the kernerl-argument pool is for executing the kernel. - hsa_amd_memory_pool_t kernargs_pool; - hsa_amd_memory_pool_t finegrained_pool; - hsa_amd_memory_pool_t coarsegrained_pool; - if (hsa_status_t err = - get_agent_memory_pool( - host_agent, &kernargs_pool)) - handle_error(err); - if (hsa_status_t err = - get_agent_memory_pool( - host_agent, &finegrained_pool)) - handle_error(err); - if (hsa_status_t err = - get_agent_memory_pool( - dev_agent, &coarsegrained_pool)) - handle_error(err); - - // The AMDGPU target can change its wavefront size. There currently isn't a - // good way to look this up through the HSA API so we use the LLVM interface. - uint16_t abi_version; - llvm::StringRef image_ref(reinterpret_cast(image), size); - llvm::StringMap info_map; - if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage( - llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) { - handle_error(llvm::toString(std::move(err)).c_str()); - } - - // Allocate fine-grained memory on the host to hold the pointer array for the - // copied argv and allow the GPU agent to access it. - auto allocator = [&](uint64_t size) -> void * { - void *dev_ptr = nullptr; - if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size, - /*flags=*/0, &dev_ptr)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr); - return dev_ptr; - }; - void *dev_argv = copy_argument_vector(argc, argv, allocator); - if (!dev_argv) - handle_error("Failed to allocate device argv"); - - // Allocate fine-grained memory on the host to hold the pointer array for the - // copied environment array and allow the GPU agent to access it. - void *dev_envp = copy_environment(envp, allocator); - if (!dev_envp) - handle_error("Failed to allocate device environment"); - - // Allocate space for the return pointer and initialize it to zero. - void *dev_ret; - if (hsa_status_t err = - hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int), - /*flags=*/0, &dev_ret)) - handle_error(err); - hsa_amd_memory_fill(dev_ret, 0, /*count=*/1); - - // Allocate finegrained memory for the RPC server and client to share. 
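The hunk just below selects the largest wavefront size recorded in the image's kernel metadata via llvm::max_element. An equivalent standard-library sketch of that selection (std::map stands in for the llvm::StringMap the real code iterates):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

struct KernelMetaData { uint32_t WavefrontSize; };

int main() {
  std::map<std::string, KernelMetaData> info_map = {
      {"_begin", {64}}, {"_start", {32}}, {"_end", {64}}};

  auto it = std::max_element(info_map.begin(), info_map.end(),
                             [](auto &&x, auto &&y) {
                               return x.second.WavefrontSize <
                                      y.second.WavefrontSize;
                             });
  std::printf("wavefront size = %u\n", it->second.WavefrontSize); // 64
  return 0;
}
```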
- uint32_t wavefront_size = - llvm::max_element(info_map, [](auto &&x, auto &&y) { - return x.second.WavefrontSize < y.second.WavefrontSize; - })->second.WavefrontSize; - - // Set up the RPC server. - void *rpc_buffer; - if (hsa_status_t err = hsa_amd_memory_pool_allocate( - finegrained_pool, - rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT), - /*flags=*/0, &rpc_buffer)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer); - - rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer); - rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer); - - // Initialize the RPC client on the device by copying the local data to the - // device's internal pointer. - hsa_executable_symbol_t rpc_client_sym; - if (hsa_status_t err = hsa_executable_get_symbol_by_name( - executable, "__llvm_rpc_client", &dev_agent, &rpc_client_sym)) - handle_error(err); - - void *rpc_client_dev; - if (hsa_status_t err = hsa_executable_symbol_get_info( - rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, - &rpc_client_dev)) - handle_error(err); - - void *rpc_client_buffer; - if (hsa_status_t err = - hsa_amd_memory_lock(&client, sizeof(rpc::Client), - /*agents=*/nullptr, 0, &rpc_client_buffer)) - handle_error(err); - - // Copy the RPC client buffer to the address pointed to by the symbol. - if (hsa_status_t err = - hsa_memcpy(rpc_client_dev, dev_agent, rpc_client_buffer, host_agent, - sizeof(rpc::Client))) - handle_error(err); - - if (hsa_status_t err = hsa_amd_memory_unlock(&client)) - handle_error(err); - - // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. - // If the clock_freq symbol is missing, no work to do. - hsa_executable_symbol_t freq_sym; - if (HSA_STATUS_SUCCESS == - hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq", - &dev_agent, &freq_sym)) { - void *host_clock_freq; - if (hsa_status_t err = - hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t), - /*flags=*/0, &host_clock_freq)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq); - - if (HSA_STATUS_SUCCESS == - hsa_agent_get_info(dev_agent, - static_cast( - HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY), - host_clock_freq)) { - - void *freq_addr; - if (hsa_status_t err = hsa_executable_symbol_get_info( - freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, - &freq_addr)) - handle_error(err); - - if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq, - host_agent, sizeof(uint64_t))) - handle_error(err); - } - } - - // Obtain a queue with the maximum (power of two) size, used to send commands - // to the HSA runtime and launch execution on the device. 
- uint64_t queue_size; - if (hsa_status_t err = hsa_agent_get_info( - dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size)) - handle_error(err); - hsa_queue_t *queue = nullptr; - if (hsa_status_t err = - hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, - nullptr, UINT32_MAX, UINT32_MAX, &queue)) - handle_error(err); - - LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; - begin_args_t init_args = {argc, dev_argv, dev_envp}; - if (hsa_status_t err = launch_kernel( - dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, - server, single_threaded_params, "_begin.kd", init_args, - info_map["_begin"].WavefrontSize, print_resource_usage)) - handle_error(err); - - start_args_t args = {argc, dev_argv, dev_envp, dev_ret}; - if (hsa_status_t err = launch_kernel( - dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, - server, params, "_start.kd", args, info_map["_start"].WavefrontSize, - print_resource_usage)) - handle_error(err); - - void *host_ret; - if (hsa_status_t err = - hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int), - /*flags=*/0, &host_ret)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret); - - if (hsa_status_t err = - hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int))) - handle_error(err); - - // Save the return value and perform basic clean-up. - int ret = *static_cast(host_ret); - - end_args_t fini_args = {}; - if (hsa_status_t err = launch_kernel( - dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, - server, single_threaded_params, "_end.kd", fini_args, - info_map["_end"].WavefrontSize, print_resource_usage)) - handle_error(err); - - if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer)) - handle_error(err); - - // Free the memory allocated for the device. 
- if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv)) - handle_error(err); - if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret)) - handle_error(err); - if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret)) - handle_error(err); - - if (hsa_status_t err = hsa_queue_destroy(queue)) - handle_error(err); - - if (hsa_status_t err = hsa_executable_destroy(executable)) - handle_error(err); - - if (hsa_status_t err = hsa_shut_down()) - handle_error(err); - - return ret; -} diff --git a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp index a8204664e85eb..d66f2b8d69b62 100644 --- a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp +++ b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp @@ -35,121 +35,255 @@ using namespace llvm; -static cl::OptionCategory loader_category("loader options"); +static cl::OptionCategory LoaderCategory("loader options"); -static cl::opt help("h", cl::desc("Alias for -help"), cl::Hidden, - cl::cat(loader_category)); +static cl::opt Help("h", cl::desc("Alias for -help"), cl::Hidden, + cl::cat(LoaderCategory)); static cl::opt - threads_x("threads-x", cl::desc("Number of threads in the 'x' dimension"), - cl::init(1), cl::cat(loader_category)); + Threads_x("threads-x", cl::desc("Number of threads in the 'x' dimension"), + cl::init(1), cl::cat(LoaderCategory)); static cl::opt - threads_y("threads-y", cl::desc("Number of threads in the 'y' dimension"), - cl::init(1), cl::cat(loader_category)); + Threads_y("threads-y", cl::desc("Number of threads in the 'y' dimension"), + cl::init(1), cl::cat(LoaderCategory)); static cl::opt - threads_z("threads-z", cl::desc("Number of threads in the 'z' dimension"), - cl::init(1), cl::cat(loader_category)); -static cl::alias threads("threads", cl::aliasopt(threads_x), + Threads_z("threads-z", cl::desc("Number of threads in the 'z' dimension"), + cl::init(1), cl::cat(LoaderCategory)); +static cl::alias threads("threads", cl::aliasopt(Threads_x), cl::desc("Alias for --threads-x"), - cl::cat(loader_category)); + cl::cat(LoaderCategory)); static cl::opt - blocks_x("blocks-x", cl::desc("Number of blocks in the 'x' dimension"), - cl::init(1), cl::cat(loader_category)); + Blocks_x("blocks-x", cl::desc("Number of blocks in the 'x' dimension"), + cl::init(1), cl::cat(LoaderCategory)); static cl::opt - blocks_y("blocks-y", cl::desc("Number of blocks in the 'y' dimension"), - cl::init(1), cl::cat(loader_category)); + Blocks_y("blocks-y", cl::desc("Number of blocks in the 'y' dimension"), + cl::init(1), cl::cat(LoaderCategory)); static cl::opt - blocks_z("blocks-z", cl::desc("Number of blocks in the 'z' dimension"), - cl::init(1), cl::cat(loader_category)); -static cl::alias blocks("blocks", cl::aliasopt(blocks_x), + Blocks_z("blocks-z", cl::desc("Number of blocks in the 'z' dimension"), + cl::init(1), cl::cat(LoaderCategory)); +static cl::alias Blocks("blocks", cl::aliasopt(Blocks_x), cl::desc("Alias for --blocks-x"), - cl::cat(loader_category)); + cl::cat(LoaderCategory)); -static cl::opt - print_resource_usage("print-resource-usage", - cl::desc("Output resource usage of launched kernels"), - cl::init(false), cl::cat(loader_category)); - -static cl::opt file(cl::Positional, cl::Required, +static cl::opt File(cl::Positional, cl::Required, cl::desc(""), - cl::cat(loader_category)); -static cl::list args(cl::ConsumeAfter, + cl::cat(LoaderCategory)); +static cl::list Args(cl::ConsumeAfter, cl::desc("..."), - cl::cat(loader_category)); + cl::cat(LoaderCategory)); + +// The arguments to the '_begin' 
kernel.
+struct BeginArgs {
+  int Argc;
+  void *Argv;
+  void *Envp;
+};
+
+// The arguments to the '_start' kernel.
+struct StartArgs {
+  int Argc;
+  void *Argv;
+  void *Envp;
+  void *Ret;
+};
 
-[[noreturn]] void report_error(Error E) {
+// The arguments to the '_end' kernel.
+struct EndArgs {};
+
+[[noreturn]] static void handleError(Error E) {
   outs().flush();
   logAllUnhandledErrors(std::move(E), WithColor::error(errs(), "loader"));
   exit(EXIT_FAILURE);
 }
 
-std::string get_main_executable(const char *name) {
-  void *ptr = (void *)(intptr_t)&get_main_executable;
-  auto cow_path = sys::fs::getMainExecutable(name, ptr);
-  return sys::path::parent_path(cow_path).str();
+[[noreturn]] static void handleError(ol_result_t Err, unsigned Line) {
+  fprintf(stderr, "%s:%d %s\n", __FILE__, Line, Err->Details);
+  exit(EXIT_FAILURE);
+}
+
+#define OFFLOAD_ERR(X)                                                         \
+  if (ol_result_t Err = X)                                                     \
+    handleError(Err, __LINE__);
+
+static void *copyArgumentVector(int Argc, const char **Argv,
+                                ol_device_handle_t Device) {
+  size_t ArgSize = sizeof(char *) * (Argc + 1);
+  size_t StringLen = 0;
+  for (int i = 0; i < Argc; ++i)
+    StringLen += strlen(Argv[i]) + 1;
+
+  // We allocate enough space for a null-terminated array and all the strings.
+  void *DevArgv;
+  OFFLOAD_ERR(
+      olMemAlloc(Device, OL_ALLOC_TYPE_HOST, ArgSize + StringLen, &DevArgv));
+  if (!DevArgv)
+    handleError(
+        createStringError("Failed to allocate memory for the argument vector."));
+
+  // Store the strings linearly in the same memory buffer.
+  void *DevString = reinterpret_cast<uint8_t *>(DevArgv) + ArgSize;
+  for (int i = 0; i < Argc; ++i) {
+    size_t size = strlen(Argv[i]) + 1;
+    std::memcpy(DevString, Argv[i], size);
+    static_cast<void **>(DevArgv)[i] = DevString;
+    DevString = reinterpret_cast<uint8_t *>(DevString) + size;
+  }
+
+  // Ensure the vector is null terminated.
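The assignment that follows null-terminates the pointer table. For reference, a host-only sketch of the layout copyArgumentVector builds — one allocation holding the pointer array followed by the packed strings — with plain malloc standing in for olMemAlloc:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cstring>

static void *pack_argv(int argc, const char **argv) {
  size_t arg_size = sizeof(char *) * (argc + 1);
  size_t string_len = 0;
  for (int i = 0; i < argc; ++i)
    string_len += std::strlen(argv[i]) + 1;

  // One buffer: [ptr0, ptr1, ..., nullptr][str0\0 str1\0 ...]
  void *buf = std::malloc(arg_size + string_len);
  char *str = static_cast<char *>(buf) + arg_size;
  for (int i = 0; i < argc; ++i) {
    size_t size = std::strlen(argv[i]) + 1;
    std::memcpy(str, argv[i], size);
    static_cast<char **>(buf)[i] = str;
    str += size;
  }
  static_cast<char **>(buf)[argc] = nullptr;
  return buf;
}

int main() {
  const char *argv[] = {"a.out", "--flag"};
  char **packed = static_cast<char **>(pack_argv(2, argv));
  for (char **p = packed; *p; ++p)
    std::printf("%s\n", *p);
  std::free(packed);
  return 0;
}
```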
+ reinterpret_cast(DevArgv)[Argc] = nullptr; + return DevArgv; +} + +void *copyEnvironment(const char **Envp, ol_device_handle_t Device) { + int Envc = 0; + for (const char **Env = Envp; *Env != 0; ++Env) + ++Envc; + + return copyArgumentVector(Envc, Envp, Device); +} + +ol_device_handle_t findDevice(MemoryBufferRef Binary) { + ol_device_handle_t Device; + std::tuple Data = std::make_tuple(&Device, &Binary); + OFFLOAD_ERR(olIterateDevices( + [](ol_device_handle_t Device, void *UserData) { + auto &[Output, Binary] = *reinterpret_cast(UserData); + bool IsValid = false; + OFFLOAD_ERR(olIsValidBinary(Device, Binary->getBufferStart(), + Binary->getBufferSize(), &IsValid)); + if (!IsValid) + return true; + + *Output = Device; + return false; + }, + &Data)); + return Device; +} + +ol_device_handle_t getHostDevice() { + ol_device_handle_t Device; + OFFLOAD_ERR(olIterateDevices( + [](ol_device_handle_t Device, void *UserData) { + ol_platform_handle_t Platform; + olGetDeviceInfo(Device, OL_DEVICE_INFO_PLATFORM, sizeof(Platform), + &Platform); + ol_platform_backend_t Backend; + olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND, sizeof(Backend), + &Backend); + + auto &Output = *reinterpret_cast(UserData); + if (Backend == OL_PLATFORM_BACKEND_HOST) { + Output = Device; + return false; + } + return true; + }, + &Device)); + return Device; +} + +ol_program_handle_t loadBinary(std::vector &Binary, + std::vector &Devices) { + for (ol_device_handle_t &Device : Devices) { + bool IsValid = false; + OFFLOAD_ERR( + olIsValidBinary(Device, Binary.data(), Binary.size(), &IsValid)); + if (!IsValid) + continue; + + ol_program_handle_t Program; + OFFLOAD_ERR( + olCreateProgram(Device, Binary.data(), Binary.size(), &Program)); + return Program; + } + handleError( + createStringError("No valid device found for '%s'", File.c_str())); +} + +template +void launchKernel(ol_queue_handle_t Queue, ol_device_handle_t Device, + ol_program_handle_t Program, const char *Name, + ol_kernel_launch_size_args_t LaunchArgs, Args KernelArgs) { + ol_symbol_handle_t Kernel; + OFFLOAD_ERR(olGetSymbol(Program, Name, OL_SYMBOL_KIND_KERNEL, &Kernel)); + + OFFLOAD_ERR(olLaunchKernel(Queue, Device, Kernel, &KernelArgs, + std::is_empty_v ? 0 : sizeof(Args), + &LaunchArgs)); } int main(int argc, const char **argv, const char **envp) { sys::PrintStackTraceOnErrorSignal(argv[0]); - cl::HideUnrelatedOptions(loader_category); + cl::HideUnrelatedOptions(LoaderCategory); cl::ParseCommandLineOptions( argc, argv, "A utility used to launch unit tests built for a GPU target. 
This is\n"
      "intended to provide an interface similar to cross-compiling emulators\n");
 
-  if (help) {
+  if (Help) {
     cl::PrintHelpMessage();
     return EXIT_SUCCESS;
   }
 
-  ErrorOr<std::unique_ptr<MemoryBuffer>> image_or_err =
-      MemoryBuffer::getFileOrSTDIN(file);
-  if (std::error_code ec = image_or_err.getError())
-    report_error(errorCodeToError(ec));
-  MemoryBufferRef image = **image_or_err;
-
-  SmallVector<const char *> new_argv = {file.c_str()};
-  llvm::transform(args, std::back_inserter(new_argv),
-                  [](const std::string &arg) { return arg.c_str(); });
-
-  Expected<llvm::object::ELF64LEObjectFile> elf_or_err =
-      llvm::object::ELF64LEObjectFile::create(image);
-  if (!elf_or_err)
-    report_error(elf_or_err.takeError());
-
-  int ret = 1;
-  if (elf_or_err->getArch() == Triple::amdgcn) {
-#ifdef AMDHSA_SUPPORT
-    LaunchParameters params{threads_x, threads_y, threads_z,
-                            blocks_x,  blocks_y,  blocks_z};
-
-    ret = load_amdhsa(new_argv.size(), new_argv.data(), envp,
-                      const_cast<char *>(image.getBufferStart()),
-                      image.getBufferSize(), params, print_resource_usage);
-#else
-    report_error(createStringError(
-        "Unsupported architecture; %s",
-        Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
-#endif
-  } else if (elf_or_err->getArch() == Triple::nvptx64) {
-#ifdef NVPTX_SUPPORT
-    LaunchParameters params{threads_x, threads_y, threads_z,
-                            blocks_x,  blocks_y,  blocks_z};
-
-    ret = load_nvptx(new_argv.size(), new_argv.data(), envp,
-                     const_cast<char *>(image.getBufferStart()),
-                     image.getBufferSize(), params, print_resource_usage);
-#else
-    report_error(createStringError(
-        "Unsupported architecture; %s",
-        Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
-#endif
-  } else {
-    report_error(createStringError(
-        "Unsupported architecture; %s",
-        Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
-  }
+  if (Error Err = loadLLVMOffload())
+    handleError(std::move(Err));
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> ImageOrErr =
+      MemoryBuffer::getFileOrSTDIN(File);
+  if (std::error_code EC = ImageOrErr.getError())
+    handleError(errorCodeToError(EC));
+  MemoryBufferRef Image = **ImageOrErr;
+
+  SmallVector<const char *> NewArgv = {File.c_str()};
+  llvm::transform(Args, std::back_inserter(NewArgv),
+                  [](const std::string &Arg) { return Arg.c_str(); });
+
+  OFFLOAD_ERR(olInit());
+  ol_device_handle_t Device = findDevice(Image);
+  ol_device_handle_t Host = getHostDevice();
+
+  ol_program_handle_t Program;
+  OFFLOAD_ERR(olCreateProgram(Device, Image.getBufferStart(),
+                              Image.getBufferSize(), &Program));
+
+  ol_queue_handle_t Queue;
+  OFFLOAD_ERR(olCreateQueue(Device, &Queue));
+
+  int DevArgc = static_cast<int>(NewArgv.size());
+  void *DevArgv = copyArgumentVector(NewArgv.size(), NewArgv.begin(), Device);
+  void *DevEnvp = copyEnvironment(envp, Device);
+
+  void *DevRet;
+  OFFLOAD_ERR(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, sizeof(int), &DevRet));
+
+  ol_kernel_launch_size_args_t BeginLaunch{1, {1, 1, 1}, {1, 1, 1}, 0};
+  BeginArgs BeginArgs = {DevArgc, DevArgv, DevEnvp};
+  launchKernel(Queue, Device, Program, "_begin", BeginLaunch, BeginArgs);
+  OFFLOAD_ERR(olSyncQueue(Queue));
+
+  uint32_t Dims = (Blocks_z > 1) ? 3 : (Blocks_y > 1) ?
2 : 1; + ol_kernel_launch_size_args_t StartLaunch{Dims, + {Blocks_x, Blocks_y, Blocks_z}, + {Threads_x, Threads_y, Threads_z}, + /*SharedMemBytes=*/0}; + StartArgs StartArgs = {DevArgc, DevArgv, DevEnvp, DevRet}; + launchKernel(Queue, Device, Program, "_start", StartLaunch, StartArgs); + + ol_kernel_launch_size_args_t EndLaunch{1, {1, 1, 1}, {1, 1, 1}, 0}; + EndArgs EndArgs = {}; + launchKernel(Queue, Device, Program, "_end", EndLaunch, EndArgs); + + int Ret; + OFFLOAD_ERR(olMemcpy(Queue, &Ret, Host, DevRet, Device, sizeof(int))); + OFFLOAD_ERR(olSyncQueue(Queue)); + + OFFLOAD_ERR(olMemFree(DevArgv)); + OFFLOAD_ERR(olMemFree(DevEnvp)); + OFFLOAD_ERR(olDestroyQueue(Queue)); + OFFLOAD_ERR(olDestroyProgram(Program)); + OFFLOAD_ERR(olShutDown()); - return ret; + return Ret; } diff --git a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h index 08861c29b4fa4..3990cb3911e30 100644 --- a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h +++ b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h @@ -1,108 +1,177 @@ -//===-- Generic device loader interface -----------------------------------===// +//===-- Dynamically loaded offload API ------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// +// Dynamically loads the API provided by the LLVMOffload library. We need to do +// this dynamically because this tool is used before it is actually built and +// should be provided even when the user did not specify the offload runtime. +// +//===----------------------------------------------------------------------===// #ifndef LLVM_TOOLS_LLVM_GPU_LOADER_LLVM_GPU_LOADER_H #define LLVM_TOOLS_LLVM_GPU_LOADER_LLVM_GPU_LOADER_H -#include -#include -#include -#include -#include - -/// Generic launch parameters for configuration the number of blocks / threads. -struct LaunchParameters { - uint32_t num_threads_x; - uint32_t num_threads_y; - uint32_t num_threads_z; - uint32_t num_blocks_x; - uint32_t num_blocks_y; - uint32_t num_blocks_z; -}; - -/// The arguments to the '_begin' kernel. -struct begin_args_t { - int argc; - void *argv; - void *envp; -}; - -/// The arguments to the '_start' kernel. -struct start_args_t { - int argc; - void *argv; - void *envp; - void *ret; -}; - -/// The arguments to the '_end' kernel. -struct end_args_t {}; - -/// Generic interface to load the \p image and launch execution of the _start -/// kernel on the target device. Copies \p argc and \p argv to the device. -/// Returns the final value of the `main` function on the device. -#ifdef AMDHSA_SUPPORT -int load_amdhsa(int argc, const char **argv, const char **evnp, void *image, - size_t size, const LaunchParameters ¶ms, - bool print_resource_usage); -#endif -#ifdef NVPTX_SUPPORT -int load_nvptx(int argc, const char **argv, const char **evnp, void *image, - size_t size, const LaunchParameters ¶ms, - bool print_resource_usage); -#endif - -/// Return \p V aligned "upwards" according to \p Align. -template inline V align_up(V val, A align) { - return ((val + V(align) - 1) / V(align)) * V(align); -} - -/// Copy the system's argument vector to GPU memory allocated using \p alloc. 
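The align_up helper deleted just above is plain round-up arithmetic; a few compile-time spot checks (constexpr is added here so static_assert can evaluate it):

```cpp
#include <cstdint>

template <typename V, typename A> constexpr V align_up(V val, A align) {
  return ((val + V(align) - 1) / V(align)) * V(align);
}

static_assert(align_up(uint64_t(13), 8) == 16);
static_assert(align_up(uint64_t(16), 8) == 16); // already aligned: unchanged
static_assert(align_up(uint64_t(1), 64) == 64);

int main() { return 0; }
```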
-template -void *copy_argument_vector(int argc, const char **argv, Allocator alloc) { - size_t argv_size = sizeof(char *) * (argc + 1); - size_t str_size = 0; - for (int i = 0; i < argc; ++i) - str_size += strlen(argv[i]) + 1; - - // We allocate enough space for a null terminated array and all the strings. - void *dev_argv = alloc(argv_size + str_size); - if (!dev_argv) - return nullptr; - - // Store the strings linerally in the same memory buffer. - void *dev_str = reinterpret_cast(dev_argv) + argv_size; - for (int i = 0; i < argc; ++i) { - size_t size = strlen(argv[i]) + 1; - std::memcpy(dev_str, argv[i], size); - static_cast(dev_argv)[i] = dev_str; - dev_str = reinterpret_cast(dev_str) + size; - } - - // Ensure the vector is null terminated. - reinterpret_cast(dev_argv)[argc] = nullptr; - return dev_argv; -} - -/// Copy the system's environment to GPU memory allocated using \p alloc. -template -void *copy_environment(const char **envp, Allocator alloc) { - int envc = 0; - for (const char **env = envp; *env != 0; ++env) - ++envc; - - return copy_argument_vector(envc, envp, alloc); -} - -inline void handle_error_impl(const char *file, int32_t line, const char *msg) { - fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, msg); - exit(EXIT_FAILURE); +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Support/Error.h" + +typedef enum ol_alloc_type_t { + OL_ALLOC_TYPE_HOST = 0, + OL_ALLOC_TYPE_DEVICE = 1, + OL_ALLOC_TYPE_FORCE_UINT32 = 0x7fffffff +} ol_alloc_type_t; + +typedef enum ol_device_info_t { + OL_DEVICE_INFO_TYPE = 0, + OL_DEVICE_INFO_PLATFORM = 1, + OL_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff +} ol_device_info_t; + +typedef enum ol_platform_info_t { + OL_PLATFORM_INFO_NAME = 0, + OL_PLATFORM_INFO_BACKEND = 3, + OL_PLATFORM_INFO_FORCE_UINT32 = 0x7fffffff +} ol_platform_info_t; + +typedef enum ol_symbol_kind_t { + OL_SYMBOL_KIND_KERNEL = 0, + OL_SYMBOL_KIND_GLOBAL_VARIABLE = 1, + OL_SYMBOL_KIND_FORCE_UINT32 = 0x7fffffff +} ol_symbol_kind_t; + +typedef enum ol_errc_t { + OL_ERRC_SUCCESS = 0, + OL_ERRC_FORCE_UINT32 = 0x7fffffff +} ol_errc_t; + +typedef struct ol_error_struct_t { + ol_errc_t Code; + const char *Details; +} ol_error_struct_t; + +typedef struct ol_dimensions_t { + uint32_t x; + uint32_t y; + uint32_t z; +} ol_dimensions_t; + +typedef struct ol_kernel_launch_size_args_t { + size_t Dimensions; + struct ol_dimensions_t NumGroups; + struct ol_dimensions_t GroupSize; + size_t DynSharedMemory; +} ol_kernel_launch_size_args_t; + +typedef enum ol_platform_backend_t { + OL_PLATFORM_BACKEND_UNKNOWN = 0, + OL_PLATFORM_BACKEND_CUDA = 1, + OL_PLATFORM_BACKEND_AMDGPU = 2, + OL_PLATFORM_BACKEND_HOST = 3, + OL_PLATFORM_BACKEND_LAST = 4, + OL_PLATFORM_BACKEND_FORCE_UINT32 = 0x7fffffff +} ol_platform_backend_t; + +typedef struct ol_device_impl_t *ol_device_handle_t; +typedef struct ol_platform_impl_t *ol_platform_handle_t; +typedef struct ol_program_impl_t *ol_program_handle_t; +typedef struct ol_queue_impl_t *ol_queue_handle_t; +typedef struct ol_symbol_impl_t *ol_symbol_handle_t; +typedef const struct ol_error_struct_t *ol_result_t; + +typedef bool (*ol_device_iterate_cb_t)(ol_device_handle_t Device, + void *UserData); + +ol_result_t (*olInit)(); +ol_result_t (*olShutDown)(); + +ol_result_t (*olIterateDevices)(ol_device_iterate_cb_t Callback, + void *UserData); + +ol_result_t (*olIsValidBinary)(ol_device_handle_t Device, const void *ProgData, + size_t ProgDataSize, bool *Valid); + +ol_result_t (*olCreateProgram)(ol_device_handle_t Device, const void *ProgData, + size_t ProgDataSize, + 
ol_program_handle_t *Program); + +ol_result_t (*olDestroyProgram)(ol_program_handle_t Program); + +ol_result_t (*olGetSymbol)(ol_program_handle_t Program, const char *Name, + ol_symbol_kind_t Kind, ol_symbol_handle_t *Symbol); + +ol_result_t (*olLaunchKernel)( + ol_queue_handle_t Queue, ol_device_handle_t Device, + ol_symbol_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize, + const ol_kernel_launch_size_args_t *LaunchSizeArgs); + +ol_result_t (*olCreateQueue)(ol_device_handle_t Device, + ol_queue_handle_t *Queue); + +ol_result_t (*olDestroyQueue)(ol_queue_handle_t Queue); + +ol_result_t (*olSyncQueue)(ol_queue_handle_t Queue); + +ol_result_t (*olMemAlloc)(ol_device_handle_t Device, ol_alloc_type_t Type, + size_t Size, void **AllocationOut); + +ol_result_t (*olMemFree)(void *Address); + +ol_result_t (*olMemcpy)(ol_queue_handle_t Queue, void *DstPtr, + ol_device_handle_t DstDevice, const void *SrcPtr, + ol_device_handle_t SrcDevice, size_t Size); + +ol_result_t (*olGetDeviceInfo)(ol_device_handle_t Device, + ol_device_info_t PropName, size_t PropSize, + void *PropValue); + +ol_result_t (*olGetPlatformInfo)(ol_platform_handle_t Platform, + ol_platform_info_t PropName, size_t PropSize, + void *PropValue); + +llvm::Error loadLLVMOffload() { + constexpr const char *OffloadLibrary = "libLLVMOffload.so"; + + std::string ErrMsg; + auto DynlibHandle = std::make_unique( + llvm::sys::DynamicLibrary::getPermanentLibrary(OffloadLibrary, &ErrMsg)); + + if (!DynlibHandle->isValid()) + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "Failed to dlopen %s: %s", OffloadLibrary, + ErrMsg.c_str()); + +#define DYNAMIC_INIT(SYM) \ + do { \ + void *Ptr = DynlibHandle->getAddressOfSymbol(#SYM); \ + if (!Ptr) \ + return llvm::createStringError( \ + llvm::inconvertibleErrorCode(), "Missing symbol '%s' in %s", \ + reinterpret_cast(#SYM), OffloadLibrary); \ + SYM = reinterpret_cast(Ptr); \ + } while (0) + + DYNAMIC_INIT(olInit); + DYNAMIC_INIT(olShutDown); + DYNAMIC_INIT(olIterateDevices); + DYNAMIC_INIT(olIsValidBinary); + DYNAMIC_INIT(olCreateProgram); + DYNAMIC_INIT(olDestroyProgram); + DYNAMIC_INIT(olGetSymbol); + DYNAMIC_INIT(olLaunchKernel); + DYNAMIC_INIT(olCreateQueue); + DYNAMIC_INIT(olDestroyQueue); + DYNAMIC_INIT(olSyncQueue); + DYNAMIC_INIT(olMemAlloc); + DYNAMIC_INIT(olMemFree); + DYNAMIC_INIT(olMemcpy); + DYNAMIC_INIT(olGetDeviceInfo); + DYNAMIC_INIT(olGetPlatformInfo); +#undef DYNAMIC_INIT + + return llvm::Error::success(); } -#define handle_error(X) handle_error_impl(__FILE__, __LINE__, X) #endif // LLVM_TOOLS_LLVM_GPU_LOADER_LLVM_GPU_LOADER_H diff --git a/llvm/tools/llvm-gpu-loader/nvptx.cpp b/llvm/tools/llvm-gpu-loader/nvptx.cpp deleted file mode 100644 index 82b455249ad24..0000000000000 --- a/llvm/tools/llvm-gpu-loader/nvptx.cpp +++ /dev/null @@ -1,367 +0,0 @@ -//===-- Loader Implementation for NVPTX devices --------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file impelements a simple loader to run images supporting the NVPTX -// architecture. The file launches the '_start' kernel which should be provided -// by the device application start code and call ultimately call the 'main' -// function. 
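A trimmed sketch of the DYNAMIC_INIT pattern defined above: resolve one symbol from a shared library and cast it to a typed function pointer. Here libm.so.6 and cos stand in for libLLVMOffload.so and the ol* entry points:

```cpp
#include "llvm/Support/DynamicLibrary.h"
#include <cstdio>
#include <string>

int main() {
  std::string ErrMsg;
  auto Lib = llvm::sys::DynamicLibrary::getPermanentLibrary("libm.so.6",
                                                            &ErrMsg);
  if (!Lib.isValid()) {
    std::fprintf(stderr, "dlopen failed: %s\n", ErrMsg.c_str());
    return 1;
  }

  // getAddressOfSymbol returns void *; the caller supplies the real type.
  using CosFn = double (*)(double);
  auto *Cos = reinterpret_cast<CosFn>(Lib.getAddressOfSymbol("cos"));
  if (!Cos)
    return 1;
  std::printf("cos(0) = %f\n", Cos(0.0)); // 1.000000
  return 0;
}
```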
-// -//===----------------------------------------------------------------------===// - -#include "llvm-gpu-loader.h" -#include "server.h" - -#include "cuda.h" - -#include "llvm/Object/ELF.h" -#include "llvm/Object/ELFObjectFile.h" - -#include -#include -#include -#include -#include -#include -#include - -using namespace llvm; -using namespace object; - -static void handle_error_impl(const char *file, int32_t line, CUresult err) { - if (err == CUDA_SUCCESS) - return; - - const char *err_str = nullptr; - CUresult result = cuGetErrorString(err, &err_str); - if (result != CUDA_SUCCESS) - fprintf(stderr, "%s:%d:0: Unknown Error\n", file, line); - else - fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, err_str); - exit(1); -} - -// Gets the names of all the globals that contain functions to initialize or -// deinitialize. We need to do this manually because the NVPTX toolchain does -// not contain the necessary binary manipulation tools. -template -Expected get_ctor_dtor_array(const void *image, const size_t size, - Alloc allocator, CUmodule binary) { - auto mem_buffer = MemoryBuffer::getMemBuffer( - StringRef(reinterpret_cast(image), size), "image", - /*RequiresNullTerminator=*/false); - Expected elf_or_err = - ELF64LEObjectFile::create(*mem_buffer); - if (!elf_or_err) - handle_error(toString(elf_or_err.takeError()).c_str()); - - std::vector> ctors; - std::vector> dtors; - // CUDA has no way to iterate over all the symbols so we need to inspect the - // ELF directly using the LLVM libraries. - for (const auto &symbol : elf_or_err->symbols()) { - auto name_or_err = symbol.getName(); - if (!name_or_err) - handle_error(toString(name_or_err.takeError()).c_str()); - - // Search for all symbols that contain a constructor or destructor. - if (!name_or_err->starts_with("__init_array_object_") && - !name_or_err->starts_with("__fini_array_object_")) - continue; - - uint16_t priority; - if (name_or_err->rsplit('_').second.getAsInteger(10, priority)) - handle_error("Invalid priority for constructor or destructor"); - - if (name_or_err->starts_with("__init")) - ctors.emplace_back(std::make_pair(name_or_err->data(), priority)); - else - dtors.emplace_back(std::make_pair(name_or_err->data(), priority)); - } - // Lower priority constructors are run before higher ones. The reverse is true - // for destructors. - llvm::sort(ctors, llvm::less_second()); - llvm::sort(dtors, llvm::less_second()); - - // Allocate host pinned memory to make these arrays visible to the GPU. - CUdeviceptr *dev_memory = reinterpret_cast(allocator( - ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr))); - uint64_t global_size = 0; - - // Get the address of the global and then store the address of the constructor - // function to call in the constructor array. - CUdeviceptr *dev_ctors_start = dev_memory; - CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size(); - for (uint64_t i = 0; i < ctors.size(); ++i) { - CUdeviceptr dev_ptr; - if (CUresult err = - cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first)) - handle_error(err); - if (CUresult err = - cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t))) - handle_error(err); - } - - // Get the address of the global and then store the address of the destructor - // function to call in the destructor array. 
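The priority parsing in the deleted code above leans on StringRef::rsplit splitting at the last underscore, so the trailing digits survive even when the mangled name itself contains underscores. An isolated check (the symbol name is made up for illustration):

```cpp
#include "llvm/ADT/StringRef.h"
#include <cstdint>
#include <cstdio>

int main() {
  llvm::StringRef Name = "__init_array_object_my_ctor_101";
  uint16_t Priority;
  // rsplit('_') splits on the *last* underscore; getAsInteger returns true
  // on parse failure.
  if (Name.rsplit('_').second.getAsInteger(10, Priority))
    return 1;
  std::printf("priority = %u\n", Priority); // priority = 101
  return 0;
}
```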
- CUdeviceptr *dev_dtors_start = dev_ctors_end; - CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size(); - for (uint64_t i = 0; i < dtors.size(); ++i) { - CUdeviceptr dev_ptr; - if (CUresult err = - cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first)) - handle_error(err); - if (CUresult err = - cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t))) - handle_error(err); - } - - // Obtain the address of the pointers the startup implementation uses to - // iterate the constructors and destructors. - CUdeviceptr init_start; - if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary, - "__init_array_start")) - handle_error(err); - CUdeviceptr init_end; - if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary, - "__init_array_end")) - handle_error(err); - CUdeviceptr fini_start; - if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary, - "__fini_array_start")) - handle_error(err); - CUdeviceptr fini_end; - if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary, - "__fini_array_end")) - handle_error(err); - - // Copy the pointers to the newly written array to the symbols so the startup - // implementation can iterate them. - if (CUresult err = - cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t))) - handle_error(err); - if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t))) - handle_error(err); - if (CUresult err = - cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t))) - handle_error(err); - if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t))) - handle_error(err); - - return dev_memory; -} - -void print_kernel_resources(CUmodule binary, const char *kernel_name) { - CUfunction function; - if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) - handle_error(err); - int num_regs; - if (CUresult err = - cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function)) - handle_error(err); - printf("Executing kernel %s:\n", kernel_name); - printf("%6s registers: %d\n", kernel_name, num_regs); -} - -template -CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server, - const LaunchParameters ¶ms, const char *kernel_name, - args_t kernel_args, bool print_resource_usage) { - // look up the '_start' kernel in the loaded module. - CUfunction function; - if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) - handle_error(err); - - // Set up the arguments to the '_start' kernel on the GPU. - uint64_t args_size = std::is_empty_v ? 0 : sizeof(args_t); - void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args, - CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size, - CU_LAUNCH_PARAM_END}; - if (print_resource_usage) - print_kernel_resources(binary, kernel_name); - - // Initialize a non-blocking CUDA stream to allocate memory if needed. - // This needs to be done on a separate stream or else it will deadlock - // with the executing kernel. - CUstream memory_stream; - if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING)) - handle_error(err); - - std::atomic finished = false; - std::thread server_thread( - [](std::atomic *finished, rpc::Server *server, - CUstream memory_stream) { - auto malloc_handler = [&](size_t size) -> void * { - CUdeviceptr dev_ptr; - if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream)) - dev_ptr = 0UL; - - // Wait until the memory allocation is complete. 
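The deleted launch path above passes arguments through cuLaunchKernel's `extra` parameter rather than the usual kernelParams array; the packing is just a null-terminated key/value list. A compile-only sketch of that packing (requires the CUDA driver header):

```cpp
#include "cuda.h"
#include <cstdint>
#include <cstdio>

struct start_args_t { int argc; void *argv; void *envp; void *ret; };

int main() {
  start_args_t args = {1, nullptr, nullptr, nullptr};
  uint64_t args_size = sizeof(args);
  void *config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
                    CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                    CU_LAUNCH_PARAM_END};
  // `config` would be passed as the final `extra` argument of cuLaunchKernel
  // with kernelParams set to nullptr; the driver copies args_size bytes.
  std::printf("packed %llu bytes\n", (unsigned long long)args_size);
  (void)config;
  return 0;
}
```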
-int load_nvptx(int argc, const char **argv, const char **envp, void *image,
-               size_t size, const LaunchParameters &params,
-               bool print_resource_usage) {
-  if (CUresult err = cuInit(0))
-    handle_error(err);
-  // Obtain the first device found on the system.
-  uint32_t device_id = 0;
-  CUdevice device;
-  if (CUresult err = cuDeviceGet(&device, device_id))
-    handle_error(err);
-
-  // Initialize the CUDA context and claim it for this execution.
-  CUcontext context;
-  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
-    handle_error(err);
-  if (CUresult err = cuCtxSetCurrent(context))
-    handle_error(err);
-
-  // Increase the stack size per thread.
-  // TODO: We should allow this to be passed in so only the tests that require
-  // a larger stack can specify it to save on memory usage.
-  if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024))
-    handle_error(err);
-
-  // Initialize a non-blocking CUDA stream to execute the kernel.
-  CUstream stream;
-  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
-    handle_error(err);
-
-  // Load the image into a CUDA module.
-  CUmodule binary;
-  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
-    handle_error(err);
-
-  // Allocate pinned memory on the host to hold the pointer array for the
-  // copied argv and allow the GPU device to access it.
-  auto allocator = [&](uint64_t size) -> void * {
-    void *dev_ptr;
-    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
-      handle_error(err);
-    return dev_ptr;
-  };
-
-  auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
-  if (!memory_or_err)
-    handle_error(toString(memory_or_err.takeError()).c_str());
-
-  void *dev_argv = copy_argument_vector(argc, argv, allocator);
-  if (!dev_argv)
-    handle_error("Failed to allocate device argv");
-
-  // Allocate pinned memory on the host to hold the pointer array for the
-  // copied environment array and allow the GPU device to access it.
-  void *dev_envp = copy_environment(envp, allocator);
-  if (!dev_envp)
-    handle_error("Failed to allocate device environment");
-
-  // Allocate space for the return pointer and initialize it to zero.
-  CUdeviceptr dev_ret;
-  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
-    handle_error(err);
-  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
-    handle_error(err);
-
-  uint32_t warp_size = 32;
-  void *rpc_buffer = nullptr;
-  if (CUresult err = cuMemAllocHost(
-          &rpc_buffer,
-          rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
-    handle_error(err);
-  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
-  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
-
-  // Initialize the RPC client on the device by copying the local data to the
-  // device's internal pointer.
-  CUdeviceptr rpc_client_dev = 0;
-  uint64_t client_ptr_size = sizeof(void *);
-  if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
-                                       binary, "__llvm_rpc_client"))
-    handle_error(err);
-
-  if (CUresult err =
-          cuMemcpyHtoD(rpc_client_dev, &client, sizeof(rpc::Client)))
-    handle_error(err);
-
-  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
-  begin_args_t init_args = {argc, dev_argv, dev_envp};
-  if (CUresult err =
-          launch_kernel(binary, stream, server, single_threaded_params,
-                        "_begin", init_args, print_resource_usage))
-    handle_error(err);
-
-  start_args_t args = {argc, dev_argv, dev_envp,
-                       reinterpret_cast<void *>(dev_ret)};
-  if (CUresult err = launch_kernel(binary, stream, server, params, "_start",
-                                   args, print_resource_usage))
-    handle_error(err);
-
-  // Copy the return value back from the kernel and wait.
-  int host_ret = 0;
-  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
-    handle_error(err);
-
-  if (CUresult err = cuStreamSynchronize(stream))
-    handle_error(err);
-
-  end_args_t fini_args = {};
-  if (CUresult err =
-          launch_kernel(binary, stream, server, single_threaded_params, "_end",
-                        fini_args, print_resource_usage))
-    handle_error(err);
-
-  // Free the memory allocated for the device.
-  if (CUresult err = cuMemFreeHost(*memory_or_err))
-    handle_error(err);
-  if (CUresult err = cuMemFree(dev_ret))
-    handle_error(err);
-  if (CUresult err = cuMemFreeHost(dev_argv))
-    handle_error(err);
-  if (CUresult err = cuMemFreeHost(rpc_buffer))
-    handle_error(err);
-
-  // Destroy the context and the loaded binary.
-  if (CUresult err = cuModuleUnload(binary))
-    handle_error(err);
-  if (CUresult err = cuDevicePrimaryCtxRelease(device))
-    handle_error(err);
-  return host_ret;
-}
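Nearly every driver-API call in the deleted file follows the same `if (CUresult err = ...) handle_error(err)` idiom, where handle_error expands to handle_error_impl with the file and line baked in. A self-contained sketch of that pattern, assuming only the CUDA driver API; the CU_CHECK name is made up for illustration and is not part of the tool.

#include <cstdio>
#include <cstdlib>
#include "cuda.h"

// Check a CUresult and abort with a diagnostic on failure, roughly what the
// deleted handle_error()/handle_error_impl() pair did.
#define CU_CHECK(expr)                                                         \
  do {                                                                         \
    CUresult err_ = (expr);                                                    \
    if (err_ != CUDA_SUCCESS) {                                                \
      const char *msg = nullptr;                                               \
      cuGetErrorString(err_, &msg);                                            \
      fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__,                       \
              msg ? msg : "Unknown Error");                                    \
      exit(1);                                                                 \
    }                                                                          \
  } while (0)

int main() {
  CU_CHECK(cuInit(0));
  CUdevice device;
  CU_CHECK(cuDeviceGet(&device, /*device_id=*/0));
  return 0;
}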
diff --git a/llvm/tools/llvm-gpu-loader/server.h b/llvm/tools/llvm-gpu-loader/server.h
deleted file mode 100644
index da73cc007f5d5..0000000000000
--- a/llvm/tools/llvm-gpu-loader/server.h
+++ /dev/null
@@ -1,55 +0,0 @@
-//===-- Common RPC server handler -----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_LLVM_GPU_LOADER_SERVER_H
-#define LLVM_TOOLS_LLVM_GPU_LOADER_SERVER_H
-
-#include <cstddef>
-#include <cstdint>
-
-#include "shared/rpc.h"
-#include "shared/rpc_opcodes.h"
-#include "shared/rpc_server.h"
-
-template <uint32_t num_lanes, typename Alloc, typename Free>
-inline uint32_t handle_server(rpc::Server &server, uint32_t index,
-                              Alloc &&alloc, Free &&free) {
-  auto port = server.try_open(num_lanes, index);
-  if (!port)
-    return 0;
-  index = port->get_index() + 1;
-
-  int status = rpc::RPC_SUCCESS;
-  switch (port->get_opcode()) {
-  case LIBC_MALLOC: {
-    port->recv_and_send([&](rpc::Buffer *buffer, uint32_t) {
-      buffer->data[0] = reinterpret_cast<uintptr_t>(alloc(buffer->data[0]));
-    });
-    break;
-  }
-  case LIBC_FREE: {
-    port->recv([&](rpc::Buffer *buffer, uint32_t) {
-      free(reinterpret_cast<void *>(buffer->data[0]));
-    });
-    break;
-  }
-  // Handle all of the `libc` specific opcodes.
-  default:
-    status = LIBC_NAMESPACE::shared::handle_libc_opcodes(*port, num_lanes);
-    break;
-  }
-
-  if (status != rpc::RPC_SUCCESS)
-    handle_error("Error handling RPC server");
-
-  port->close();
-
-  return index;
-}
-
-#endif // LLVM_TOOLS_LLVM_GPU_LOADER_SERVER_H
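At its core, the deleted handle_server() is an opcode dispatcher over a shared request buffer: open a port, switch on the opcode, service malloc/free locally, and forward everything else to the common libc handlers. A self-contained sketch of that shape follows; the opcode values and the Request layout here are invented for illustration and do not match the real rpc::Buffer protocol.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

enum Opcode : uint16_t { OP_MALLOC = 1, OP_FREE = 2 };

struct Request {
  Opcode opcode;
  uint64_t data; // Size for OP_MALLOC, pointer for OP_FREE.
};

// Dispatch a single request, returning a value to write back to the buffer.
uint64_t handle_request(const Request &req) {
  switch (req.opcode) {
  case OP_MALLOC:
    // Service the allocation and hand the pointer back to the device.
    return reinterpret_cast<uint64_t>(std::malloc(req.data));
  case OP_FREE:
    std::free(reinterpret_cast<void *>(req.data));
    return 0;
  default:
    // The real handler forwards unknown opcodes to shared libc handlers
    // instead of failing outright.
    std::fprintf(stderr, "Unknown opcode %d\n", int(req.opcode));
    std::exit(1);
  }
}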