Skip to content

Commit

Permalink
[libc] Implement basic malloc and free support on the GPU
Browse files Browse the repository at this point in the history
This patch adds support for the `malloc` and `free` functions. These
currently aren't implemented in-tree so we first add the interface
files.

This patch provides the most basic support for a true `malloc` and
`free` by using the RPC interface. This is functional, but in the future
we will want to implement a more intelligent system and primarily use
the RPC interface more as a `brk()` or `sbrk()` interface only called
when absolutely necessary. We will need to design an intelligent
allocator in the future.

The semantics of these memory allocations will need to be checked. I am
somewhat iffy on the details. I've heard that HSA can allocate
asynchronously which seems to work with my tests at least. CUDA uses an
implicit synchronization scheme so we need to use an explicitly separate
stream from the one launching the kernel or the default stream. I will
need to test the NVPTX case.

I would appreciate if anyone more experienced with the implementation details
here could chime in for the HSA and CUDA cases.

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D151735
  • Loading branch information
jhuber6 committed Jun 5, 2023
1 parent 30bd96f commit a621308
Show file tree
Hide file tree
Showing 13 changed files with 254 additions and 21 deletions.
8 changes: 5 additions & 3 deletions libc/src/__support/RPC/rpc.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,11 @@ enum Opcode : uint16_t {
NOOP = 0,
PRINT_TO_STDERR = 1,
EXIT = 2,
TEST_INCREMENT = 3,
TEST_INTERFACE = 4,
TEST_STREAM = 5,
MALLOC = 3,
FREE = 4,
TEST_INCREMENT = 5,
TEST_INTERFACE = 6,
TEST_STREAM = 7,
};

/// A fixed size channel used to communicate between the RPC client and server.
Expand Down
32 changes: 29 additions & 3 deletions libc/src/stdlib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -287,10 +287,23 @@ if(LLVM_LIBC_INCLUDE_SCUDO)
DEPENDS
${SCUDO_DEPS}
)
elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
add_entrypoint_external(
calloc
)
add_entrypoint_external(
realloc
)
add_entrypoint_external(
aligned_alloc
)
else()
add_entrypoint_external(
malloc
)
add_entrypoint_external(
free
)
add_entrypoint_external(
calloc
)
Expand All @@ -300,9 +313,6 @@ else()
add_entrypoint_external(
aligned_alloc
)
add_entrypoint_external(
free
)
endif()

if(NOT LLVM_LIBC_FULL_BUILD)
Expand Down Expand Up @@ -356,3 +366,19 @@ add_entrypoint_object(
DEPENDS
.${LIBC_TARGET_OS}.abort
)

# On GPU targets, expose `malloc` and `free` as alias entrypoints that forward
# to the targets named `.${LIBC_TARGET_OS}.malloc` / `.${LIBC_TARGET_OS}.free`
# (presumably defined in the per-OS subdirectory, e.g. gpu/ -- confirm).
if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
# Alias for the target-specific `malloc` entrypoint.
add_entrypoint_object(
malloc
ALIAS
DEPENDS
.${LIBC_TARGET_OS}.malloc
)

# Alias for the target-specific `free` entrypoint.
add_entrypoint_object(
free
ALIAS
DEPENDS
.${LIBC_TARGET_OS}.free
)
endif()
20 changes: 20 additions & 0 deletions libc/src/stdlib/free.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//===-- Implementation header for free --------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <stdlib.h>

#ifndef LLVM_LIBC_SRC_STDLIB_FREE_H
#define LLVM_LIBC_SRC_STDLIB_FREE_H

namespace __llvm_libc {

/// libc-internal entrypoint for the C standard `free` function.
///
/// \param ptr Pointer to hand back to the allocator.
void free(void *ptr);

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STDLIB_FREE_H
21 changes: 21 additions & 0 deletions libc/src/stdlib/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# GPU `malloc` entrypoint, built from malloc.cpp against the shared ../malloc.h
# header. It depends on the RPC client library used by the implementation.
add_entrypoint_object(
malloc
SRCS
malloc.cpp
HDRS
../malloc.h
DEPENDS
libc.include.stdlib
libc.src.__support.RPC.rpc_client
)

# GPU `free` entrypoint, built from free.cpp against the shared ../free.h
# header. It depends on the RPC client library used by the implementation.
add_entrypoint_object(
free
SRCS
free.cpp
HDRS
../free.h
DEPENDS
libc.include.stdlib
libc.src.__support.RPC.rpc_client
)
23 changes: 23 additions & 0 deletions libc/src/stdlib/gpu/free.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
//===-- GPU Implementation of free ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/stdlib/free.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/common.h"

namespace __llvm_libc {

LLVM_LIBC_FUNCTION(void, free, (void *ptr)) {
rpc::Client::Port port = rpc::client.open<rpc::FREE>();
port.send([=](rpc::Buffer *buffer) {
buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
});
port.close();
}

} // namespace __llvm_libc
26 changes: 26 additions & 0 deletions libc/src/stdlib/gpu/malloc.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
//===-- GPU Implementation of malloc --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/stdlib/malloc.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/common.h"

namespace __llvm_libc {

LLVM_LIBC_FUNCTION(void *, malloc, (size_t size)) {
void *ptr = nullptr;
rpc::Client::Port port = rpc::client.open<rpc::MALLOC>();
port.send_and_recv([=](rpc::Buffer *buffer) { buffer->data[0] = size; },
[&](rpc::Buffer *buffer) {
ptr = reinterpret_cast<void *>(buffer->data[0]);
});
port.close();
return ptr;
}

} // namespace __llvm_libc
20 changes: 20 additions & 0 deletions libc/src/stdlib/malloc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//===-- Implementation header for malloc ------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <stdlib.h>

#ifndef LLVM_LIBC_SRC_STDLIB_MALLOC_H
#define LLVM_LIBC_SRC_STDLIB_MALLOC_H

namespace __llvm_libc {

/// libc-internal entrypoint for the C standard `malloc` function.
///
/// \param size Number of bytes requested.
/// \returns Pointer produced by the allocator for the request.
void *malloc(size_t size);

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STDLIB_MALLOC_H
15 changes: 15 additions & 0 deletions libc/test/src/stdlib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -320,4 +320,19 @@ if(LLVM_LIBC_FULL_BUILD)
libc.src.signal.raise
)

# Only the GPU has an in-tree 'malloc' implementation.
if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
add_libc_test(
malloc_test
HERMETIC_TEST_ONLY
SUITE
libc-stdlib-tests
SRCS
malloc_test.cpp
DEPENDS
libc.include.stdlib
libc.src.stdlib.malloc
libc.src.stdlib.free
)
endif()
endif()
19 changes: 19 additions & 0 deletions libc/test/src/stdlib/malloc_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
//===-- Unittests for malloc ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/stdlib/free.h"
#include "src/stdlib/malloc.h"
#include "test/UnitTest/Test.h"

// Allocates a single `int`, checks that the allocation can be written and read
// back, and then releases it.
TEST(LlvmLibcMallocTest, Allocate) {
  int *ptr = reinterpret_cast<int *>(__llvm_libc::malloc(sizeof(int)));
  // Use ASSERT rather than EXPECT: the store below must not run if the
  // allocation failed, otherwise the test would dereference a null pointer.
  ASSERT_NE(reinterpret_cast<void *>(ptr), static_cast<void *>(nullptr));
  *ptr = 1;
  EXPECT_EQ(*ptr, 1);
  __llvm_libc::free(ptr);
}
11 changes: 9 additions & 2 deletions libc/utils/gpu/loader/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,17 @@ endif()
find_package(CUDAToolkit QUIET)
# The CUDA loader requires LLVM to traverse the ELF image for symbols.
find_package(LLVM QUIET)
if(CUDAToolkit_FOUND AND LLVM_FOUND)
if(CUDAToolkit_FOUND AND LLVM_FOUND AND
${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11.2")
add_subdirectory(nvptx)
else()
message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
if(${CUDAToolkit_VERSION} VERSION_LESS "11.2")
message(WARNING
"Skipping CUDA loader for gpu target, CUDA must be version 11.2 or later.
Found CUDA Version ${CUDAToolkit_VERSION}")
else()
message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
endif()
endif()

# Add a custom target to be used for testing.
Expand Down
16 changes: 15 additions & 1 deletion libc/utils/gpu/loader/Server.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ static __llvm_libc::rpc::Server server;

/// Queries the RPC client at least once and performs server-side work if there
/// are any active requests.
void handle_server() {
template <typename Alloc, typename Dealloc>
void handle_server(Alloc allocator, Dealloc deallocator) {
using namespace __llvm_libc;

// Continue servicing the client until there is no work left and we return.
Expand Down Expand Up @@ -50,6 +51,19 @@ void handle_server() {
});
break;
}
case rpc::Opcode::MALLOC: {
port->recv_and_send([&](rpc::Buffer *buffer) {
buffer->data[0] =
reinterpret_cast<uintptr_t>(allocator(buffer->data[0]));
});
break;
}
case rpc::Opcode::FREE: {
port->recv([&](rpc::Buffer *buffer) {
deallocator(reinterpret_cast<void *>(buffer->data[0]));
});
break;
}
case rpc::Opcode::TEST_INCREMENT: {
port->recv_and_send([](rpc::Buffer *buffer) {
reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;
Expand Down
37 changes: 27 additions & 10 deletions libc/utils/gpu/loader/amdgpu/Loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
hsa_amd_memory_pool_t kernargs_pool,
hsa_amd_memory_pool_t coarsegrained_pool,
hsa_queue_t *queue, const LaunchParameters &params,
const char *kernel_name, args_t kernel_args) {
// Look up the '_start' kernel in the loaded executable.
Expand All @@ -142,6 +143,21 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
executable, kernel_name, &dev_agent, &symbol))
return err;

// Server-side allocation callback: allocates `size` bytes from the
// coarse-grained memory pool and grants `dev_agent` access to the result.
// Any HSA failure is reported through handle_error.
auto allocator = [&](uint64_t size) -> void * {
  void *dev_ptr = nullptr;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
                                       /*flags=*/0, &dev_ptr))
    handle_error(err);
  // Do not ignore the status here: if access cannot be granted, the pointer
  // handed back to the device would not be usable.
  if (hsa_status_t err =
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr))
    handle_error(err);
  return dev_ptr;
};

auto deallocator = [](void *ptr) -> void {
if (hsa_status_t err = hsa_amd_memory_pool_free(ptr))
handle_error(err);
};

// Retrieve different properties of the kernel symbol used for launch.
uint64_t kernel;
uint32_t args_size;
Expand Down Expand Up @@ -219,11 +235,11 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
while (hsa_signal_wait_scacquire(
packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
/*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
handle_server();
handle_server(allocator, deallocator);

// Handle the server one more time in case the kernel exited with a pending
// send still in flight.
handle_server();
handle_server(allocator, deallocator);

// Destroy the resources acquired to launch the kernel and return.
if (hsa_status_t err = hsa_amd_memory_pool_free(args))
Expand Down Expand Up @@ -366,14 +382,15 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,

LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
begin_args_t init_args = {argc, dev_argv, dev_envp, rpc_shared_buffer};
if (hsa_status_t err =
launch_kernel(dev_agent, executable, kernargs_pool, queue,
single_threaded_params, "_begin.kd", init_args))
if (hsa_status_t err = launch_kernel(
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
single_threaded_params, "_begin.kd", init_args))
handle_error(err);

start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
queue, params, "_start.kd", args))
if (hsa_status_t err =
launch_kernel(dev_agent, executable, kernargs_pool,
coarsegrained_pool, queue, params, "_start.kd", args))
handle_error(err);

// Create a memory signal and copy the return value back from the device into
Expand Down Expand Up @@ -402,9 +419,9 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
int ret = *static_cast<int *>(host_ret);

end_args_t fini_args = {ret};
if (hsa_status_t err =
launch_kernel(dev_agent, executable, kernargs_pool, queue,
single_threaded_params, "_end.kd", fini_args))
if (hsa_status_t err = launch_kernel(
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
single_threaded_params, "_end.kd", fini_args))
handle_error(err);

// Free the memory allocated for the device.
Expand Down
27 changes: 25 additions & 2 deletions libc/utils/gpu/loader/nvptx/Loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,29 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
CU_LAUNCH_PARAM_END};

// Initialize a non-blocking CUDA stream to allocate memory if needed. This
// needs to be done on a separate stream or else it will deadlock with the
// executing kernel.
CUstream memory_stream;
if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
handle_error(err);

auto allocator = [&](uint64_t size) -> void * {
CUdeviceptr dev_ptr;
if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
handle_error(err);

// Wait until the memory allocation is complete.
while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
;
return reinterpret_cast<void *>(dev_ptr);
};
auto deallocator = [&](void *ptr) -> void {
if (CUresult err =
cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr), memory_stream))
handle_error(err);
};

// Call the kernel with the given arguments.
if (CUresult err = cuLaunchKernel(
function, params.num_blocks_x, params.num_blocks_y,
Expand All @@ -184,11 +207,11 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
// Wait until the kernel has completed execution on the device. Periodically
// check the RPC client for work to be performed on the server.
while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
handle_server();
handle_server(allocator, deallocator);

// Handle the server one more time in case the kernel exited with a pending
// send still in flight.
handle_server();
handle_server(allocator, deallocator);

return CUDA_SUCCESS;
}
Expand Down

0 comments on commit a621308

Please sign in to comment.