Skip to content

Commit

Permalink
[libc] Add the '--threads' and '--blocks' option to the GPU loaders
Browse files Browse the repository at this point in the history
We will want to test the GPU `libc` with multiple threads in the future.
This patch adds the `--threads` and `--blocks` option to set the `x`
dimension of the kernel. Using CUDA terminology instead of OpenCL for
familiarity.

Depends on D148288 D148342

Reviewed By: jdoerfert, sivachandra, tra

Differential Revision: https://reviews.llvm.org/D148485
  • Loading branch information
jhuber6 committed Apr 19, 2023
1 parent 814dfb0 commit bc11bb3
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 19 deletions.
13 changes: 12 additions & 1 deletion libc/utils/gpu/loader/Loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,21 @@
#include <cstring>
#include <stddef.h>

/// Generic launch parameters for configuration the number of blocks / threads.
struct LaunchParameters {
uint32_t num_threads_x;
uint32_t num_threads_y;
uint32_t num_threads_z;
uint32_t num_blocks_x;
uint32_t num_blocks_y;
uint32_t num_blocks_z;
};

/// Generic interface to load the \p image and launch execution of the _start
/// kernel on the target device. Copies \p argc and \p argv to the device.
/// Returns the final value of the `main` function on the device.
int load(int argc, char **argv, char **evnp, void *image, size_t size);
int load(int argc, char **argv, char **evnp, void *image, size_t size,
const LaunchParameters &params);

/// Copy the system's argument vector to GPU memory allocated using \p alloc.
template <typename Allocator>
Expand Down
58 changes: 53 additions & 5 deletions libc/utils/gpu/loader/Main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,69 @@

#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

int main(int argc, char **argv, char **envp) {
if (argc < 2) {
printf("USAGE: ./loader <device_image> <args>, ...\n");
printf("USAGE: ./loader [--threads <n>, --blocks <n>] <device_image> "
"<args>, ...\n");
return EXIT_SUCCESS;
}

// TODO: We should perform some validation on the file.
FILE *file = fopen(argv[1], "r");
int offset = 0;
FILE *file = nullptr;
char *ptr;
LaunchParameters params = {1, 1, 1, 1, 1, 1};
while (!file && ++offset < argc) {
if (argv[offset] == std::string("--threads") ||
argv[offset] == std::string("--threads-x")) {
params.num_threads_x =
offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
offset++;
continue;
} else if (argv[offset] == std::string("--threads-y")) {
params.num_threads_y =
offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
offset++;
continue;
} else if (argv[offset] == std::string("--threads-z")) {
params.num_threads_z =
offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
offset++;
continue;
} else if (argv[offset] == std::string("--blocks") ||
argv[offset] == std::string("--blocks-x")) {
params.num_blocks_x =
offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
offset++;
continue;
} else if (argv[offset] == std::string("--blocks-y")) {
params.num_blocks_y =
offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
offset++;
continue;
} else if (argv[offset] == std::string("--blocks-z")) {
params.num_blocks_z =
offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
offset++;
continue;
} else {
file = fopen(argv[offset], "r");
if (!file) {
fprintf(stderr, "Failed to open image file '%s'\n", argv[offset]);
return EXIT_FAILURE;
}
break;
}
}

if (!file) {
fprintf(stderr, "Failed to open image file %s\n", argv[1]);
fprintf(stderr, "No image file provided\n");
return EXIT_FAILURE;
}

// TODO: We should perform some validation on the file.
fseek(file, 0, SEEK_END);
const auto size = ftell(file);
fseek(file, 0, SEEK_SET);
Expand All @@ -39,7 +87,7 @@ int main(int argc, char **argv, char **envp) {
fclose(file);

// Drop the loader from the program arguments.
int ret = load(argc - 1, &argv[1], envp, image, size);
int ret = load(argc - offset, &argv[offset], envp, image, size, params);

free(image);
return ret;
Expand Down
19 changes: 11 additions & 8 deletions libc/utils/gpu/loader/amdgpu/Loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,8 @@ hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
return iterate_agent_memory_pools(agent, cb);
}

int load(int argc, char **argv, char **envp, void *image, size_t size) {
int load(int argc, char **argv, char **envp, void *image, size_t size,
const LaunchParameters &params) {
// Initialize the HSA runtime used to communicate with the device.
if (hsa_status_t err = hsa_init())
handle_error(err);
Expand Down Expand Up @@ -355,13 +356,15 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
// with one thread on the device, forcing the rest of the wavefront to be
// masked off.
std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
packet->setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet->workgroup_size_x = 1;
packet->workgroup_size_y = 1;
packet->workgroup_size_z = 1;
packet->grid_size_x = 1;
packet->grid_size_y = 1;
packet->grid_size_z = 1;
packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
(params.num_blocks_z * params.num_threads_z != 1))
<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet->workgroup_size_x = params.num_threads_x;
packet->workgroup_size_y = params.num_threads_y;
packet->workgroup_size_z = params.num_threads_z;
packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
packet->private_segment_size = private_size;
packet->group_segment_size = group_size;
packet->kernel_object = kernel;
Expand Down
11 changes: 6 additions & 5 deletions libc/utils/gpu/loader/nvptx/Loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ static void handle_error(const char *msg) {
exit(EXIT_FAILURE);
}

int load(int argc, char **argv, char **envp, void *image, size_t size) {
int load(int argc, char **argv, char **envp, void *image, size_t size,
const LaunchParameters &params) {
if (CUresult err = cuInit(0))
handle_error(err);

Expand Down Expand Up @@ -157,10 +158,10 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
server.reset(server_inbox, server_outbox, buffer);

// Call the kernel with the given arguments.
if (CUresult err =
cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1,
/*gridDimZ=*/1, /*blockDimX=*/1, /*blockDimY=*/1,
/*bloackDimZ=*/1, 0, stream, nullptr, args_config))
if (CUresult err = cuLaunchKernel(
function, params.num_blocks_x, params.num_blocks_y,
params.num_blocks_z, params.num_threads_x, params.num_threads_y,
params.num_threads_z, 0, stream, nullptr, args_config))
handle_error(err);

// Wait until the kernel has completed execution on the device. Periodically
Expand Down

0 comments on commit bc11bb3

Please sign in to comment.