[libc] Implement basic malloc and free support on the GPU

This patch adds support for the `malloc` and `free` functions. These currently aren't implemented in-tree so we first add the interface files. This patch provides the most basic support for a true `malloc` and `free` by using the RPC interface. This is functional, but in the future we will want to implement a more intelligent system and primarily use the RPC interface more as a `brk()` or `sbrk()` interface only called when absolutely necessary. We will need to design an intelligent allocator in the future. The semantics of these memory allocations will need to be checked. I am somewhat iffy on the details. I've heard that HSA can allocate asynchronously which seems to work with my tests at least. CUDA uses an implicit synchronization scheme so we need to use an explicitly separate stream from the one launching the kernel or the default stream. I will need to test the NVPTX case. I would appreciate it if anyone more experienced with the implementation details here could chime in for the HSA and CUDA cases. Reviewed By: sivachandra Differential Revision: https://reviews.llvm.org/D151735
llvm · Jun 5, 2023 · a621308 · a621308
1 parent 30bd96f
commit a621308
Show file tree

Hide file tree

Showing 13 changed files with 254 additions and 21 deletions.
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
@@ -35,9 +35,11 @@ enum Opcode : uint16_t {
   NOOP = 0,
   PRINT_TO_STDERR = 1,
   EXIT = 2,
-  TEST_INCREMENT = 3,
-  TEST_INTERFACE = 4,
-  TEST_STREAM = 5,
+  MALLOC = 3,
+  FREE = 4,
+  TEST_INCREMENT = 5,
+  TEST_INTERFACE = 6,
+  TEST_STREAM = 7,
 };
 
 /// A fixed size channel used to communicate between the RPC client and server.

diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
@@ -287,10 +287,23 @@ if(LLVM_LIBC_INCLUDE_SCUDO)
     DEPENDS
       ${SCUDO_DEPS}
   )
+elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  add_entrypoint_external(
+    calloc
+  )
+  add_entrypoint_external(
+    realloc
+  )
+  add_entrypoint_external(
+    aligned_alloc
+  )
 else()
   add_entrypoint_external(
     malloc
   )
+  add_entrypoint_external(
+    free
+  )
   add_entrypoint_external(
     calloc
   )
@@ -300,9 +313,6 @@ else()
   add_entrypoint_external(
     aligned_alloc
   )
-  add_entrypoint_external(
-    free
-  )
 endif()
 
 if(NOT LLVM_LIBC_FULL_BUILD)
@@ -356,3 +366,19 @@ add_entrypoint_object(
   DEPENDS
     .${LIBC_TARGET_OS}.abort
 )
+
+if(LIBC_TARGET_ARCHITECTURE_IS_GPU) # In-tree malloc/free entrypoints for GPU builds.
+  add_entrypoint_object(
+    malloc
+    ALIAS
+    DEPENDS
+      .${LIBC_TARGET_OS}.malloc # Resolved from the ${LIBC_TARGET_OS} subdirectory.
+  )
+
+  add_entrypoint_object(
+    free
+    ALIAS
+    DEPENDS
+      .${LIBC_TARGET_OS}.free # Resolved from the ${LIBC_TARGET_OS} subdirectory.
+  )
+endif()
diff --git a/libc/src/stdlib/free.h b/libc/src/stdlib/free.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for free --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdlib.h>
+
+#ifndef LLVM_LIBC_SRC_STDLIB_FREE_H
+#define LLVM_LIBC_SRC_STDLIB_FREE_H
+
+namespace __llvm_libc {
+
+void free(void *ptr);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STDLIB_FREE_H
diff --git a/libc/src/stdlib/gpu/CMakeLists.txt b/libc/src/stdlib/gpu/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_entrypoint_object(
+  malloc
+  SRCS
+    malloc.cpp
+  HDRS
+    ../malloc.h
+  DEPENDS
+    libc.include.stdlib
+    libc.src.__support.RPC.rpc_client
+)
+
+add_entrypoint_object(
+  free
+  SRCS
+    free.cpp
+  HDRS
+    ../free.h
+  DEPENDS
+    libc.include.stdlib
+    libc.src.__support.RPC.rpc_client
+)
diff --git a/libc/src/stdlib/gpu/free.cpp b/libc/src/stdlib/gpu/free.cpp
@@ -0,0 +1,23 @@
+//===-- GPU Implementation of free ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/free.h"
+#include "src/__support/RPC/rpc_client.h"
+#include "src/__support/common.h"
+
+namespace __llvm_libc {
+
+LLVM_LIBC_FUNCTION(void, free, (void *ptr)) {
+  rpc::Client::Port port = rpc::client.open<rpc::FREE>(); // FREE request.
+  port.send([=](rpc::Buffer *buffer) {
+    buffer->data[0] = reinterpret_cast<uintptr_t>(ptr); // Address to release.
+  }); // Send only; no reply is read back from the server.
+  port.close();
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/stdlib/gpu/malloc.cpp b/libc/src/stdlib/gpu/malloc.cpp
@@ -0,0 +1,26 @@
+//===-- GPU Implementation of malloc --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/malloc.h"
+#include "src/__support/RPC/rpc_client.h"
+#include "src/__support/common.h"
+
+namespace __llvm_libc {
+
+LLVM_LIBC_FUNCTION(void *, malloc, (size_t size)) {
+  void *ptr = nullptr; // Overwritten with the server's reply below.
+  rpc::Client::Port port = rpc::client.open<rpc::MALLOC>(); // MALLOC request.
+  port.send_and_recv([=](rpc::Buffer *buffer) { buffer->data[0] = size; },
+                     [&](rpc::Buffer *buffer) {
+                       ptr = reinterpret_cast<void *>(buffer->data[0]);
+                     }); // data[0] carries the size out and the address back.
+  port.close();
+  return ptr;
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/stdlib/malloc.h b/libc/src/stdlib/malloc.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for malloc ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdlib.h>
+
+#ifndef LLVM_LIBC_SRC_STDLIB_MALLOC_H
+#define LLVM_LIBC_SRC_STDLIB_MALLOC_H
+
+namespace __llvm_libc {
+
+void *malloc(size_t size);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STDLIB_MALLOC_H
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
@@ -320,4 +320,19 @@ if(LLVM_LIBC_FULL_BUILD)
       libc.src.signal.raise
   )
 
+  # Only the GPU has an in-tree 'malloc' implementation.
+  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+    add_libc_test(
+      malloc_test
+      HERMETIC_TEST_ONLY
+      SUITE
+        libc-stdlib-tests
+      SRCS
+        malloc_test.cpp
+      DEPENDS
+        libc.include.stdlib
+        libc.src.stdlib.malloc
+        libc.src.stdlib.free
+    )
+  endif()
 endif()
diff --git a/libc/test/src/stdlib/malloc_test.cpp b/libc/test/src/stdlib/malloc_test.cpp
@@ -0,0 +1,19 @@
+//===-- Unittests for malloc ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/free.h"
+#include "src/stdlib/malloc.h"
+#include "test/UnitTest/Test.h"
+
+// Checks that a small allocation succeeds, is writable and readable, and can
+// be handed back to free().
+TEST(LlvmLibcMallocTest, Allocate) {
+  int *ptr = reinterpret_cast<int *>(__llvm_libc::malloc(sizeof(int)));
+  // Use a fatal assertion: a non-fatal check would fall through to the
+  // dereference below and crash on a null pointer if the allocation failed.
+  ASSERT_NE(reinterpret_cast<void *>(ptr), static_cast<void *>(nullptr));
+  *ptr = 1;
+  EXPECT_EQ(*ptr, 1);
+  __llvm_libc::free(ptr);
+}
diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
@@ -14,10 +14,17 @@ endif()
 find_package(CUDAToolkit QUIET)
 # The CUDA loader requires LLVM to traverse the ELF image for symbols.
 find_package(LLVM QUIET)
-if(CUDAToolkit_FOUND AND LLVM_FOUND)
+# The version is quoted on purpose: when CUDA is not found the variable is
+# empty, and an unquoted expansion would leave if() with a missing operand.
+if(CUDAToolkit_FOUND AND LLVM_FOUND AND
+   "${CUDAToolkit_VERSION}" VERSION_GREATER_EQUAL "11.2")
   add_subdirectory(nvptx)
 else()
-  message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
+  if(CUDAToolkit_FOUND AND "${CUDAToolkit_VERSION}" VERSION_LESS "11.2")
+    message(WARNING
+      "Skipping CUDA loader for gpu target, CUDA must be version 11.2 or later.
+       Found CUDA Version ${CUDAToolkit_VERSION}")
+  else()
+    message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
+  endif()
 endif()
 
 # Add a custom target to be used for testing.

diff --git a/libc/utils/gpu/loader/Server.h b/libc/utils/gpu/loader/Server.h
@@ -21,7 +21,8 @@ static __llvm_libc::rpc::Server server;
 
 /// Queries the RPC client at least once and performs server-side work if there
 /// are any active requests.
-void handle_server() {
+template <typename Alloc, typename Dealloc>
+void handle_server(Alloc allocator, Dealloc deallocator) {
   using namespace __llvm_libc;
 
   // Continue servicing the client until there is no work left and we return.
@@ -50,6 +51,19 @@ void handle_server() {
       });
       break;
     }
+    case rpc::Opcode::MALLOC: {
+      port->recv_and_send([&](rpc::Buffer *buffer) {
+        buffer->data[0] =
+            reinterpret_cast<uintptr_t>(allocator(buffer->data[0]));
+      });
+      break;
+    }
+    case rpc::Opcode::FREE: {
+      port->recv([&](rpc::Buffer *buffer) {
+        deallocator(reinterpret_cast<void *>(buffer->data[0]));
+      });
+      break;
+    }
     case rpc::Opcode::TEST_INCREMENT: {
       port->recv_and_send([](rpc::Buffer *buffer) {
         reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;

diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -134,6 +134,7 @@ hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
 template <typename args_t>
 hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                            hsa_amd_memory_pool_t kernargs_pool,
+                           hsa_amd_memory_pool_t coarsegrained_pool,
                            hsa_queue_t *queue, const LaunchParameters &params,
                            const char *kernel_name, args_t kernel_args) {
   // Look up the '_start' kernel in the loaded executable.
@@ -142,6 +143,21 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
           executable, kernel_name, &dev_agent, &symbol))
     return err;
 
+  // Services MALLOC requests: allocates 'size' bytes from the coarse-grained
+  // pool and grants the device agent access to the new allocation. Any HSA
+  // failure is reported through handle_error().
+  auto allocator = [&](uint64_t size) -> void * {
+    void *dev_ptr = nullptr;
+    if (hsa_status_t err =
+            hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
+                                         /*flags=*/0, &dev_ptr))
+      handle_error(err);
+    // Check the status here as well instead of silently handing back memory
+    // the device may not be able to access.
+    if (hsa_status_t err =
+            hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr))
+      handle_error(err);
+    return dev_ptr;
+  };
+
+  auto deallocator = [](void *ptr) -> void {
+    if (hsa_status_t err = hsa_amd_memory_pool_free(ptr))
+      handle_error(err);
+  };
+
   // Retrieve different properties of the kernel symbol used for launch.
   uint64_t kernel;
   uint32_t args_size;
@@ -219,11 +235,11 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
   while (hsa_signal_wait_scacquire(
              packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
              /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
-    handle_server();
+    handle_server(allocator, deallocator);
 
   // Handle the server one more time in case the kernel exited with a pending
   // send still in flight.
-  handle_server();
+  handle_server(allocator, deallocator);
 
   // Destroy the resources acquired to launch the kernel and return.
   if (hsa_status_t err = hsa_amd_memory_pool_free(args))
@@ -366,14 +382,15 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
 
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
   begin_args_t init_args = {argc, dev_argv, dev_envp, rpc_shared_buffer};
-  if (hsa_status_t err =
-          launch_kernel(dev_agent, executable, kernargs_pool, queue,
-                        single_threaded_params, "_begin.kd", init_args))
+  if (hsa_status_t err = launch_kernel(
+          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+          single_threaded_params, "_begin.kd", init_args))
     handle_error(err);
 
   start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
-  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
-                                       queue, params, "_start.kd", args))
+  if (hsa_status_t err =
+          launch_kernel(dev_agent, executable, kernargs_pool,
+                        coarsegrained_pool, queue, params, "_start.kd", args))
     handle_error(err);
 
   // Create a memory signal and copy the return value back from the device into
@@ -402,9 +419,9 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
   int ret = *static_cast<int *>(host_ret);
 
   end_args_t fini_args = {ret};
-  if (hsa_status_t err =
-          launch_kernel(dev_agent, executable, kernargs_pool, queue,
-                        single_threaded_params, "_end.kd", fini_args))
+  if (hsa_status_t err = launch_kernel(
+          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+          single_threaded_params, "_end.kd", fini_args))
     handle_error(err);
 
   // Free the memory allocated for the device.

diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -174,6 +174,29 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
                          CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                          CU_LAUNCH_PARAM_END};
 
+  // Initialize a non-blocking CUDA stream to allocate memory if needed. This
+  // needs to be done on a separate stream or else it will deadlock with the
+  // executing kernel.
+  CUstream memory_stream;
+  if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
+    handle_error(err);
+
+  auto allocator = [&](uint64_t size) -> void * {
+    CUdeviceptr dev_ptr;
+    if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
+      handle_error(err);
+
+    // Wait until the memory allocation is complete.
+    while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
+      ;
+    return reinterpret_cast<void *>(dev_ptr);
+  };
+  auto deallocator = [&](void *ptr) -> void {
+    if (CUresult err =
+            cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr), memory_stream))
+      handle_error(err);
+  };
+
   // Call the kernel with the given arguments.
   if (CUresult err = cuLaunchKernel(
           function, params.num_blocks_x, params.num_blocks_y,
@@ -184,11 +207,11 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
   // Wait until the kernel has completed execution on the device. Periodically
   // check the RPC client for work to be performed on the server.
   while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
-    handle_server();
+    handle_server(allocator, deallocator);
 
   // Handle the server one more time in case the kernel exited with a pending
   // send still in flight.
-  handle_server();
+  handle_server(allocator, deallocator);
 
   return CUDA_SUCCESS;
 }