-
Notifications
You must be signed in to change notification settings - Fork 11k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc] Implement (v|f)printf on the GPU #96369
Conversation
@llvm/pr-subscribers-backend-nvptx @llvm/pr-subscribers-clang Author: Joseph Huber (jhuber6) ChangesSummary: Depends on #96015. Patch is 20.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96369.diff 18 Files Affected:
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index cdfe8cfbd9379..03fd23ae39c29 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -1671,6 +1671,7 @@ int main(int Argc, char **Argv) {
NewArgv.push_back(Arg->getValue());
for (const opt::Arg *Arg : Args.filtered(OPT_offload_opt_eq_minus))
NewArgv.push_back(Args.MakeArgString(StringRef("-") + Arg->getValue()));
+ llvm::errs() << "asdfasdf\n";
cl::ParseCommandLineOptions(NewArgv.size(), &NewArgv[0]);
Verbose = Args.hasArg(OPT_verbose);
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index 2217a696fc5d1..de1ca6bfd151f 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -1,13 +1,3 @@
-if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
- set(extra_entrypoints
- # stdio.h entrypoints
- libc.src.stdio.sprintf
- libc.src.stdio.snprintf
- libc.src.stdio.vsprintf
- libc.src.stdio.vsnprintf
- )
-endif()
-
set(TARGET_LIBC_ENTRYPOINTS
# assert.h entrypoints
libc.src.assert.__assert_fail
@@ -185,7 +175,14 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.errno.errno
# stdio.h entrypoints
- ${extra_entrypoints}
+ libc.src.stdio.printf
+ libc.src.stdio.vprintf
+ libc.src.stdio.fprintf
+ libc.src.stdio.vfprintf
+ libc.src.stdio.sprintf
+ libc.src.stdio.snprintf
+ libc.src.stdio.vsprintf
+ libc.src.stdio.vsnprintf
libc.src.stdio.feof
libc.src.stdio.ferror
libc.src.stdio.fseek
diff --git a/libc/src/__support/arg_list.h b/libc/src/__support/arg_list.h
index 0965e12afd562..3a4e5ad0fab3c 100644
--- a/libc/src/__support/arg_list.h
+++ b/libc/src/__support/arg_list.h
@@ -54,7 +54,8 @@ class MockArgList {
}
template <class T> LIBC_INLINE T next_var() {
- ++arg_counter;
+ arg_counter =
+ ((arg_counter + alignof(T) - 1) / alignof(T)) * alignof(T) + sizeof(T);
return T(arg_counter);
}
diff --git a/libc/src/gpu/rpc_fprintf.cpp b/libc/src/gpu/rpc_fprintf.cpp
index 7b0e60b59baf3..659144d133004 100644
--- a/libc/src/gpu/rpc_fprintf.cpp
+++ b/libc/src/gpu/rpc_fprintf.cpp
@@ -29,6 +29,9 @@ int fprintf_impl(::FILE *__restrict file, const char *__restrict format,
}
port.send_n(format, format_size);
+ port.recv([&](rpc::Buffer *buffer) {
+ args_size = static_cast<size_t>(buffer->data[0]);
+ });
port.send_n(args, args_size);
uint32_t ret = 0;
@@ -50,7 +53,7 @@ int fprintf_impl(::FILE *__restrict file, const char *__restrict format,
return ret;
}
-// TODO: This is a stand-in function that uses a struct pointer and size in
+// TODO: Delete this and port OpenMP to use `printf`.
// place of varargs. Once varargs support is added we will use that to
// implement the real version.
LLVM_LIBC_FUNCTION(int, rpc_fprintf,
diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt
index a659d9e847a9e..3c536a287b2c4 100644
--- a/libc/src/stdio/CMakeLists.txt
+++ b/libc/src/stdio/CMakeLists.txt
@@ -159,17 +159,6 @@ add_entrypoint_object(
libc.src.stdio.printf_core.writer
)
-add_entrypoint_object(
- fprintf
- SRCS
- fprintf.cpp
- HDRS
- fprintf.h
- DEPENDS
- libc.src.__support.arg_list
- libc.src.stdio.printf_core.vfprintf_internal
-)
-
add_entrypoint_object(
vsprintf
SRCS
@@ -192,17 +181,6 @@ add_entrypoint_object(
libc.src.stdio.printf_core.writer
)
-add_entrypoint_object(
- vfprintf
- SRCS
- vfprintf.cpp
- HDRS
- vfprintf.h
- DEPENDS
- libc.src.__support.arg_list
- libc.src.stdio.printf_core.vfprintf_internal
-)
-
add_stdio_entrypoint_object(
fileno
SRCS
@@ -261,6 +239,7 @@ add_stdio_entrypoint_object(fputc)
add_stdio_entrypoint_object(putc)
add_stdio_entrypoint_object(putchar)
add_stdio_entrypoint_object(printf)
+add_stdio_entrypoint_object(fprintf)
add_stdio_entrypoint_object(fgetc)
add_stdio_entrypoint_object(fgetc_unlocked)
add_stdio_entrypoint_object(getc)
@@ -273,3 +252,4 @@ add_stdio_entrypoint_object(stdin)
add_stdio_entrypoint_object(stdout)
add_stdio_entrypoint_object(stderr)
add_stdio_entrypoint_object(vprintf)
+add_stdio_entrypoint_object(vfprintf)
diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt
index 9cd4cfdae17f4..e0a1c577efcb6 100644
--- a/libc/src/stdio/generic/CMakeLists.txt
+++ b/libc/src/stdio/generic/CMakeLists.txt
@@ -396,6 +396,31 @@ add_entrypoint_object(
${printf_deps}
)
+add_entrypoint_object(
+ fprintf
+ SRCS
+ fprintf.cpp
+ HDRS
+ ../fprintf.h
+ DEPENDS
+ libc.src.__support.arg_list
+ libc.src.stdio.printf_core.vfprintf_internal
+ ${printf_deps}
+)
+
+add_entrypoint_object(
+ vfprintf
+ SRCS
+ vfprintf.cpp
+ HDRS
+ ../vfprintf.h
+ DEPENDS
+ libc.src.__support.arg_list
+ libc.src.stdio.printf_core.vfprintf_internal
+ ${printf_deps}
+)
+
+
add_entrypoint_object(
fgets
SRCS
diff --git a/libc/src/stdio/fprintf.cpp b/libc/src/stdio/generic/fprintf.cpp
similarity index 100%
rename from libc/src/stdio/fprintf.cpp
rename to libc/src/stdio/generic/fprintf.cpp
diff --git a/libc/src/stdio/vfprintf.cpp b/libc/src/stdio/generic/vfprintf.cpp
similarity index 100%
rename from libc/src/stdio/vfprintf.cpp
rename to libc/src/stdio/generic/vfprintf.cpp
diff --git a/libc/src/stdio/gpu/CMakeLists.txt b/libc/src/stdio/gpu/CMakeLists.txt
index 1b1e2a903cc0b..280e0d3f6b00d 100644
--- a/libc/src/stdio/gpu/CMakeLists.txt
+++ b/libc/src/stdio/gpu/CMakeLists.txt
@@ -10,6 +10,14 @@ add_header_library(
.stderr
)
+add_header_library(
+ vfprintf_utils
+ HDRS
+ vfprintf_utils.h
+ DEPENDS
+ .gpu_file
+)
+
add_entrypoint_object(
feof
SRCS
@@ -262,6 +270,46 @@ add_entrypoint_object(
.gpu_file
)
+add_entrypoint_object(
+ printf
+ SRCS
+ printf.cpp
+ HDRS
+ ../printf.h
+ DEPENDS
+ .vfprintf_utils
+)
+
+add_entrypoint_object(
+ vprintf
+ SRCS
+ vprintf.cpp
+ HDRS
+ ../vprintf.h
+ DEPENDS
+ .vfprintf_utils
+)
+
+add_entrypoint_object(
+ fprintf
+ SRCS
+ fprintf.cpp
+ HDRS
+ ../fprintf.h
+ DEPENDS
+ .vfprintf_utils
+)
+
+add_entrypoint_object(
+ vfprintf
+ SRCS
+ vfprintf.cpp
+ HDRS
+ ../vfprintf.h
+ DEPENDS
+ .vfprintf_utils
+)
+
add_entrypoint_object(
stdin
SRCS
diff --git a/libc/src/stdio/gpu/fprintf.cpp b/libc/src/stdio/gpu/fprintf.cpp
new file mode 100644
index 0000000000000..adae0203180a4
--- /dev/null
+++ b/libc/src/stdio/gpu/fprintf.cpp
@@ -0,0 +1,32 @@
+//===-- GPU Implementation of fprintf -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/fprintf.h"
+
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/arg_list.h"
+#include "src/errno/libc_errno.h"
+#include "src/stdio/gpu/vfprintf_utils.h"
+
+#include <stdio.h>
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, fprintf,
+ (::FILE *__restrict stream, const char *__restrict format,
+ ...)) {
+ va_list vlist;
+ va_start(vlist, format);
+ cpp::string_view str_view(format);
+ int ret_val =
+ file::vfprintf_internal(stream, format, str_view.size() + 1, vlist);
+ va_end(vlist);
+ return ret_val;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/gpu/printf.cpp b/libc/src/stdio/gpu/printf.cpp
new file mode 100644
index 0000000000000..44905f24bad7d
--- /dev/null
+++ b/libc/src/stdio/gpu/printf.cpp
@@ -0,0 +1,30 @@
+//===-- GPU Implementation of printf --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/printf.h"
+
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/arg_list.h"
+#include "src/errno/libc_errno.h"
+#include "src/stdio/gpu/vfprintf_utils.h"
+
+#include <stdio.h>
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) {
+ va_list vlist;
+ va_start(vlist, format);
+ cpp::string_view str_view(format);
+ int ret_val =
+ file::vfprintf_internal(stdout, format, str_view.size() + 1, vlist);
+ va_end(vlist);
+ return ret_val;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/gpu/vfprintf.cpp b/libc/src/stdio/gpu/vfprintf.cpp
new file mode 100644
index 0000000000000..2ec65d9afcb97
--- /dev/null
+++ b/libc/src/stdio/gpu/vfprintf.cpp
@@ -0,0 +1,29 @@
+//===-- GPU Implementation of vfprintf ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/vfprintf.h"
+
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/arg_list.h"
+#include "src/errno/libc_errno.h"
+#include "src/stdio/gpu/vfprintf_utils.h"
+
+#include <stdio.h>
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, vfprintf,
+ (::FILE *__restrict stream, const char *__restrict format,
+ va_list vlist)) {
+ cpp::string_view str_view(format);
+ int ret_val =
+ file::vfprintf_internal(stream, format, str_view.size() + 1, vlist);
+ return ret_val;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/gpu/vfprintf_utils.h b/libc/src/stdio/gpu/vfprintf_utils.h
new file mode 100644
index 0000000000000..e1a3b97b35673
--- /dev/null
+++ b/libc/src/stdio/gpu/vfprintf_utils.h
@@ -0,0 +1,73 @@
+//===--- GPU helper functions for printf using RPC ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/RPC/rpc_client.h"
+#include "src/__support/arg_list.h"
+#include "src/stdio/gpu/file.h"
+#include "src/string/string_utils.h"
+
+#include <stdio.h>
+
+namespace LIBC_NAMESPACE {
+namespace file {
+
+template <uint16_t opcode>
+LIBC_INLINE int vfprintf_impl(::FILE *__restrict file,
+ const char *__restrict format, size_t format_size,
+ va_list vlist) {
+ uint64_t mask = gpu::get_lane_mask();
+ rpc::Client::Port port = rpc::client.open<opcode>();
+
+ if constexpr (opcode == RPC_PRINTF_TO_STREAM) {
+ port.send([&](rpc::Buffer *buffer) {
+ buffer->data[0] = reinterpret_cast<uintptr_t>(file);
+ });
+ }
+
+ size_t args_size = 0;
+ port.send_n(format, format_size);
+ port.recv([&](rpc::Buffer *buffer) {
+ args_size = static_cast<size_t>(buffer->data[0]);
+ });
+ port.send_n(vlist, args_size);
+
+ uint32_t ret = 0;
+ for (;;) {
+ const char *str = nullptr;
+ port.recv([&](rpc::Buffer *buffer) {
+ ret = static_cast<uint32_t>(buffer->data[0]);
+ str = reinterpret_cast<const char *>(buffer->data[1]);
+ });
+ // If any lanes have a string argument it needs to be copied back.
+ if (!gpu::ballot(mask, str))
+ break;
+
+ uint64_t size = str ? internal::string_length(str) + 1 : 0;
+ port.send_n(str, size);
+ }
+
+ port.close();
+ return ret;
+}
+
+LIBC_INLINE int vfprintf_internal(::FILE *__restrict stream,
+ const char *__restrict format,
+ size_t format_size, va_list vlist) {
+ if (stream == stdout)
+ return vfprintf_impl<RPC_PRINTF_TO_STDOUT>(stream, format, format_size,
+ vlist);
+ else if (stream == stderr)
+ return vfprintf_impl<RPC_PRINTF_TO_STDERR>(stream, format, format_size,
+ vlist);
+ else
+ return vfprintf_impl<RPC_PRINTF_TO_STREAM>(stream, format, format_size,
+ vlist);
+}
+
+} // namespace file
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/gpu/vprintf.cpp b/libc/src/stdio/gpu/vprintf.cpp
new file mode 100644
index 0000000000000..ee5b89a1a33d1
--- /dev/null
+++ b/libc/src/stdio/gpu/vprintf.cpp
@@ -0,0 +1,28 @@
+//===-- GPU Implementation of vprintf -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/vprintf.h"
+
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/arg_list.h"
+#include "src/errno/libc_errno.h"
+#include "src/stdio/gpu/vfprintf_utils.h"
+
+#include <stdio.h>
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, vprintf,
+ (const char *__restrict format, va_list vlist)) {
+ cpp::string_view str_view(format);
+ int ret_val =
+ file::vfprintf_internal(stdout, format, str_view.size() + 1, vlist);
+ return ret_val;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/test/integration/src/stdio/gpu/CMakeLists.txt b/libc/test/integration/src/stdio/gpu/CMakeLists.txt
index 6327c45e1ea5a..04fbd4706c556 100644
--- a/libc/test/integration/src/stdio/gpu/CMakeLists.txt
+++ b/libc/test/integration/src/stdio/gpu/CMakeLists.txt
@@ -13,7 +13,7 @@ add_integration_test(
SRCS
printf.cpp
DEPENDS
- libc.src.gpu.rpc_fprintf
+ libc.src.stdio.fprintf
libc.src.stdio.fopen
LOADER_ARGS
--threads 32
diff --git a/libc/test/integration/src/stdio/gpu/printf.cpp b/libc/test/integration/src/stdio/gpu/printf.cpp
index 97ad4ace1dcac..766c4f9439115 100644
--- a/libc/test/integration/src/stdio/gpu/printf.cpp
+++ b/libc/test/integration/src/stdio/gpu/printf.cpp
@@ -9,8 +9,8 @@
#include "test/IntegrationTest/test.h"
#include "src/__support/GPU/utils.h"
-#include "src/gpu/rpc_fprintf.h"
#include "src/stdio/fopen.h"
+#include "src/stdio/fprintf.h"
using namespace LIBC_NAMESPACE;
@@ -20,68 +20,51 @@ TEST_MAIN(int argc, char **argv, char **envp) {
ASSERT_TRUE(file && "failed to open file");
// Check basic printing.
int written = 0;
- written = LIBC_NAMESPACE::rpc_fprintf(file, "A simple string\n", nullptr, 0);
+ written = LIBC_NAMESPACE::fprintf(file, "A simple string\n");
ASSERT_EQ(written, 16);
const char *str = "A simple string\n";
- written = LIBC_NAMESPACE::rpc_fprintf(file, "%s", &str, sizeof(void *));
+ written = LIBC_NAMESPACE::fprintf(file, "%s", str);
ASSERT_EQ(written, 16);
// Check printing a different value with each thread.
uint64_t thread_id = gpu::get_thread_id();
- written = LIBC_NAMESPACE::rpc_fprintf(file, "%8ld\n", &thread_id,
- sizeof(thread_id));
+ written = LIBC_NAMESPACE::fprintf(file, "%8ld\n", thread_id);
ASSERT_EQ(written, 9);
- struct {
- uint32_t x = 1;
- char c = 'c';
- double f = 1.0;
- } args1;
- written =
- LIBC_NAMESPACE::rpc_fprintf(file, "%d%c%.1f\n", &args1, sizeof(args1));
+ written = LIBC_NAMESPACE::fprintf(file, "%d%c%.1f\n", 1, 'c', 1.0);
ASSERT_EQ(written, 6);
- struct {
- uint32_t x = 1;
- const char *str = "A simple string\n";
- } args2;
- written =
- LIBC_NAMESPACE::rpc_fprintf(file, "%032b%s\n", &args2, sizeof(args2));
+ written = LIBC_NAMESPACE::fprintf(file, "%032b%s\n", 1, "A simple string\n");
ASSERT_EQ(written, 49);
// Check that the server correctly handles divergent numbers of arguments.
const char *format = gpu::get_thread_id() % 2 ? "%s" : "%20ld\n";
- written = LIBC_NAMESPACE::rpc_fprintf(file, format, &str, sizeof(void *));
+ written = LIBC_NAMESPACE::fprintf(file, format, str);
ASSERT_EQ(written, gpu::get_thread_id() % 2 ? 16 : 21);
format = gpu::get_thread_id() % 2 ? "%s" : str;
- written = LIBC_NAMESPACE::rpc_fprintf(file, format, &str, sizeof(void *));
+ written = LIBC_NAMESPACE::fprintf(file, format, str);
ASSERT_EQ(written, 16);
// Check that we handle null arguments correctly.
struct {
void *null = nullptr;
} args3;
- written = LIBC_NAMESPACE::rpc_fprintf(file, "%p", &args3, sizeof(args3));
+ written = LIBC_NAMESPACE::fprintf(file, "%p", nullptr);
ASSERT_EQ(written, 9);
#ifndef LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS
- written = LIBC_NAMESPACE::rpc_fprintf(file, "%s", &args3, sizeof(args3));
+ written = LIBC_NAMESPACE::fprintf(file, "%s", nullptr);
ASSERT_EQ(written, 6);
#endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS
// Check for extremely abused variable width arguments
- struct {
- uint32_t x = 1;
- uint32_t y = 2;
- double f = 1.0;
- } args4;
- written = LIBC_NAMESPACE::rpc_fprintf(file, "%**d", &args4, sizeof(args4));
+ written = LIBC_NAMESPACE::fprintf(file, "%**d", 1, 2, 1.0);
ASSERT_EQ(written, 4);
- written = LIBC_NAMESPACE::rpc_fprintf(file, "%**d%6d", &args4, sizeof(args4));
+ written = LIBC_NAMESPACE::fprintf(file, "%**d%6d", 1, 2, 1.0);
ASSERT_EQ(written, 10);
- written = LIBC_NAMESPACE::rpc_fprintf(file, "%**.**f", &args4, sizeof(args4));
+ written = LIBC_NAMESPACE::fprintf(file, "%**.**f", 1, 2, 1.0);
ASSERT_EQ(written, 7);
return 0;
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index 095f3fa13ffad..8813f67093010 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -44,7 +44,7 @@ template <uint32_t lane_size> void handle_printf(rpc::Server::Port &port) {
// Get the appropriate output stream to use.
if (port.get_opcode() == RPC_PRINTF_TO_STREAM)
port.recv([&](rpc::Buffer *buffer, uint32_t id) {
- files[id] = reinterpret_cast<FILE *>(buffer->data[0]);
+ files[id] = file::to_stream(buffer->data[0]);
});
else if (port.get_opcode() == RPC_PRINTF_TO_STDOUT)
std::fill(files, files + lane_size, stdout);
@@ -60,6 +60,28 @@ template <uint32_t lane_size> void handle_printf(rpc::Server::Port &port) {
// Recieve the format string and arguments from the client.
port.recv_n(format, format_sizes,
[&](uint64_t size) { return new char[size]; });
+
+ // Parse the format string to get the expected size of the buffer.
+ for (uint32_t lane = 0; lane < lane_size; ++lane) {
+ if (!format[lane])
+ continue;
+
+ WriteBuffer wb(nullptr, 0);
+ Writer writer(&wb);
+
+ internal::MockArgList printf_args;
+ Parser<internal::MockArgList &> parser(
+ reinterpret_cast<const char *>(format[lane]), printf_args);
+
+ for (FormatSection cur_section = parser.get_next_section();
+ !cur_section.raw_string.empty();
+ cur_section = parser.get_next_section())
+ ;
+ args_sizes[lane] = printf_args.read_count();
+ }
+ port.send([&](rpc::Buffer *buffer, uint32_t id) {
+ buffer->data[0] = args_sizes[id];
+ });
port.recv_n(args, args_sizes, [&](uint64_t size) { return new char[size]; });
// Identify any arguments that are actually pointers to strings on the client.
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index d340bc041ccda..df70d7427e52c 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -934,7 +934,13 @@ struct Amdgpu final : public VariadicABIInfo {
}
VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override {
- return {Align(4), false};
+ // NVPTX expects natural alignment in all cases. The variadic call ABI will
+ // handle promoting types to their appropriate size and alignment.
+ const unsigned MinAlign = 1;
+ Align A = DL.getABITypeAlign(Parameter);
+ if (A < MinAlign)
+ A = Align(MinAlign);
+ return {A, false};
}
}...
[truncated]
|
@@ -1671,6 +1671,7 @@ int main(int Argc, char **Argv) { | |||
NewArgv.push_back(Arg->getValue()); | |||
for (const opt::Arg *Arg : Args.filtered(OPT_offload_opt_eq_minus)) | |||
NewArgv.push_back(Args.MakeArgString(StringRef("-") + Arg->getValue())); | |||
llvm::errs() << "asdfasdf\n"; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Leftover debug print
Do we want some sort of optimization for constant printf? 99% of the time, we could parse the string at compile-time. (This sort of optimization is common for embedded targets.) If the format string isn't constant, is parsing the string on the GPU really slower than asking the host for the size? printf format strings aren't that complicated, especially if you aren't actually doing the formatting. Does this support |
I was going to make a follow-up patch that simply skipped sending back the size if there were no arguments to parse. If we enable these builtins as available on the GPU (Which I may very soon) we will also get
Well, this approach basically trades speed for resource usage. I had an old implementation that did the parsing on the GPU side, (https://reviews.llvm.org/D158774), and that had an unfortunate amount of registers used when the arguments are truly just an array in this sense. Plus, since I wrote that there's been a lot more added to the format parsing, since I think future C releases are supposed to support 128 bit integers or something. I think my old version used something like 54 SGPRs and 40 VGPRs while this version is
No, I specifically disabled it in the |
Also, I just merged the prerequisite patches into this, to get the relevant changed just look at the most recent commit. The lack of stacked PRs in GitHub really irks me. |
Yes! @michaelrj-google has some ideas. Orthogonal to this PR though. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought I submitted these comments already, oops
clang/test/CodeGen/variadic-nvptx.c
Outdated
// CHECK-NEXT: store float 1.000000e+00, ptr [[F]], align 4 | ||
// CHECK-NEXT: store double 1.000000e+00, ptr [[D]], align 8 | ||
// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[C]], align 1 | ||
// CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i8 and i16 are converted to i32 then passed to the vaarg func. Is that expected? Shouldn't they be passed as they are?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
never mind. it is expected by C
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So, it seems that Jon's original patch did that on purpose to try to save space? But it seems weird since it breaks struct padding / alignment. That's from a separate patch I just included here to make it easier. The lack of stacked PRs is annoying. See #96370
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
C promotes them to i32. C has a lot of rules around vararg type promotion that have not aged brilliantly.
If you want a i8 or i16, put it in a struct. C doesn't say anything about promoting that and amdgpu will pass it inlined into the struct, i.e. with no overhead. I believe nvptx will do likewise.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM from the libc side
// NVPTX expects natural alignment in all cases. The variadic call ABI will | ||
// handle promoting types to their appropriate size and alignment. | ||
const unsigned MinAlign = 1; | ||
Align A = DL.getABITypeAlign(Parameter); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can getABITypeAlign return 0?
Does nvptx actually expect natural alignment? That would be inconsistent with the slot size of four which strongly suggests everything is passed with at least four byte alignment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should use Align instead of unsigned for alignment values
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think so, NVPTX uses structs with normal padding, see https://godbolt.org/z/1v54YY6d3.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suspect Matt means Align MinAlign = 1;
instead of unsigned MinAlign = 1
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The amdgpu patch is incorrect, see #96370
The nvptx lowering looks dubious - values smaller than slot size should be passed with the same alignment as the slot and presently aren't. A struct containing i8, i16 or half should be miscompiled on nvptx as written.
No comment on the libc part.
I mentioned this in the original patch, it's correct as far as I know. NVPTX does not require nested structs to have slot alignment, which means that the minimum alignment is exactly the type. The C ABI helps us here by making arguments passed directly all get type promoted to |
The IR has its own ABI that may or may not match whatever the platform "C ABI' is. Especially given the lack of a formal platform ABI specification, I would not characterize not using the C ABI as misuse or against the ABI |
The ABI in this case is what NVIDIA does, figuring out whether or not an argument came from a struct or was passed directly would be a nightmare of metadata nodes so I really don't want to go down that path since there's little practical benefit. |
I've passed some types to nvcc on godbolt and tried to decode the results. It looks like it's passing everything with natural alignment, flattened, with total disregard to the minimum slot size premise clang uses. |
clang/lib/CodeGen/Targets/NVPTX.cpp
Outdated
llvm_unreachable("NVPTX does not support varargs"); | ||
return emitVoidPtrVAArg(CGF, VAListAddr, Ty, /*IsIndirect=*/false, | ||
getContext().getTypeInfoInChars(Ty), | ||
CharUnits::fromQuantity(4), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Error is here - this says slots shall be at least four bytes in size, but nvcc looks happy to pass struct {char} right next to other things, so we're looking for CharUnits::fromQuantity(1),
if (A < MinAlign) | ||
A = Align(MinAlign); | ||
return {A, false}; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you really looking for if (A < 1) A = 1;
here? Align has max/min functions which would be clearer
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this was left over, since it's just 1
now I can get rid of it.
|
||
template <class T> LIBC_INLINE T next_var() { | ||
arg_counter = | ||
((arg_counter + alignof(T) - 1) / alignof(T)) * alignof(T) + sizeof(T); | ||
return T(arg_counter); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe split this into separate increment by size and round for the alignment operations. There might be helper functions for the rounding which would be clearer than reading the division / multiplication and trying to pattern match on what alignment code usually looks like (e.g. I expected to see masking off bits so tripped over this)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's also __builtin_align_up and __builtin_align_down
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Interesting, didn't know about that one. Doesn't seem like GCC supports it, and this is one of the files that gcc
might compile so it's probably easier to keep it in a utility for now. Maybe I could do __has_builtin
for clang.
clang/lib/Basic/Targets/NVPTX.h
Outdated
@@ -116,8 +116,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo { | |||
} | |||
|
|||
BuiltinVaListKind getBuiltinVaListKind() const override { | |||
// FIXME: implement | |||
return TargetInfo::CharPtrBuiltinVaList; | |||
return TargetInfo::VoidPtrBuiltinVaList; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These should be the same as far as codegen is concerned I think, in which case we should probably leave it unchanged. Or is there a reason to change it I'm missing?
Summary: This patch implements the `printf` family of functions on the GPU using the new variadic support. This patch adapts the old handling in the `rpc_fprintf` placeholder, but adds an extra RPC call to get the size of the buffer to copy. This prevents the GPU from needing to parse the string. While it's theoretically possible for the pass to know the size of the struct, it's prohibitively difficult to do while maintaining ABI compatibility with NVIDIA's varargs. Depends on llvm#96015.
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/43/builds/2028 Here is the relevant piece of the build log for the reference:
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/147/builds/1964 Here is the relevant piece of the build log for the reference:
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/71/builds/2057 Here is the relevant piece of the build log for the reference:
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/131/builds/2021 Here is the relevant piece of the build log for the reference:
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/179/builds/2018 Here is the relevant piece of the build log for the reference:
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/93/builds/1984 Here is the relevant piece of the build log for the reference:
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/171/builds/1995 Here is the relevant piece of the build log for the reference:
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/188/builds/1427 Here is the relevant piece of the build log for the reference:
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/153/builds/2859 Here is the relevant piece of the build log for the reference:
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/78/builds/1866 Here is the relevant piece of the build log for the reference:
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/183/builds/1291 Here is the relevant piece of the build log for the reference:
|
Summary: This patch implements the `printf` family of functions on the GPU using the new variadic support. This patch adapts the old handling in the `rpc_fprintf` placeholder, but adds an extra RPC call to get the size of the buffer to copy. This prevents the GPU from needing to parse the string. While it's theoretically possible for the pass to know the size of the struct, it's prohibitively difficult to do while maintaining ABI compatibility with NVIDIA's varargs. Depends on llvm#96015.
Summary:
This patch implements the
printf
family of functions on the GPU usingthe new variadic support. This patch adapts the old handling in the
rpc_fprintf
placeholder, but adds an extra RPC call to get the size ofthe buffer to copy. This prevents the GPU from needing to parse the
string. While it's theoretically possible for the pass to know the size
of the struct, it's prohibitively difficult to do while maintaining ABI
compatibility with NVIDIA's varargs.
Depends on #96015.