From 42a7a45c845de377b9b714af39a449fdc49eb768 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Fri, 21 Jun 2024 19:10:40 -0500
Subject: [PATCH] [libc] Implement (v|f)printf on the GPU

Summary:
This patch implements the `printf` family of functions on the GPU using
the new variadic support. This patch adapts the old handling in the
`rpc_fprintf` placeholder, but adds an extra RPC call to get the size of
the buffer to copy. This prevents the GPU from needing to parse the
string. While it's theoretically possible for the pass to know the size
of the struct, it's prohibitively difficult to do while maintaining ABI
compatibility with NVIDIA's varargs.

Depends on https://github.com/llvm/llvm-project/pull/96015.
---
 .../ClangLinkerWrapper.cpp                    |  1 +
 libc/config/gpu/entrypoints.txt               | 19 ++---
 libc/src/__support/arg_list.h                 |  3 +-
 libc/src/gpu/rpc_fprintf.cpp                  |  5 +-
 libc/src/stdio/CMakeLists.txt                 | 24 +-----
 libc/src/stdio/generic/CMakeLists.txt         | 25 +++++++
 libc/src/stdio/{ => generic}/fprintf.cpp      |  0
 libc/src/stdio/{ => generic}/vfprintf.cpp     |  0
 libc/src/stdio/gpu/CMakeLists.txt             | 48 ++++++++++++
 libc/src/stdio/gpu/fprintf.cpp                | 32 ++++++++
 libc/src/stdio/gpu/printf.cpp                 | 30 ++++++++
 libc/src/stdio/gpu/vfprintf.cpp               | 29 ++++++++
 libc/src/stdio/gpu/vfprintf_utils.h           | 73 +++++++++++++++++++
 libc/src/stdio/gpu/vprintf.cpp                | 28 +++++++
 .../integration/src/stdio/gpu/CMakeLists.txt  |  2 +-
 .../test/integration/src/stdio/gpu/printf.cpp | 43 ++++-------
 libc/utils/gpu/server/rpc_server.cpp          | 24 +++++-
 llvm/lib/Transforms/IPO/ExpandVariadics.cpp   |  8 +-
 18 files changed, 326 insertions(+), 68 deletions(-)
 rename libc/src/stdio/{ => generic}/fprintf.cpp (100%)
 rename libc/src/stdio/{ => generic}/vfprintf.cpp (100%)
 create mode 100644 libc/src/stdio/gpu/fprintf.cpp
 create mode 100644 libc/src/stdio/gpu/printf.cpp
 create mode 100644 libc/src/stdio/gpu/vfprintf.cpp
 create mode 100644 libc/src/stdio/gpu/vfprintf_utils.h
 create mode 100644 libc/src/stdio/gpu/vprintf.cpp

diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index cdfe8cfbd9379..03fd23ae39c29 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -1671,6 +1671,7 @@ int main(int Argc, char **Argv) {
     NewArgv.push_back(Arg->getValue());
   for (const opt::Arg *Arg : Args.filtered(OPT_offload_opt_eq_minus))
     NewArgv.push_back(Args.MakeArgString(StringRef("-") + Arg->getValue()));
+  llvm::errs() << "asdfasdf\n";
   cl::ParseCommandLineOptions(NewArgv.size(), &NewArgv[0]);
 
   Verbose = Args.hasArg(OPT_verbose);
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index 2217a696fc5d1..de1ca6bfd151f 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -1,13 +1,3 @@
-if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
-  set(extra_entrypoints
-      # stdio.h entrypoints
-      libc.src.stdio.sprintf
-      libc.src.stdio.snprintf
-      libc.src.stdio.vsprintf
-      libc.src.stdio.vsnprintf
-  )
-endif()
-
 set(TARGET_LIBC_ENTRYPOINTS
     # assert.h entrypoints
     libc.src.assert.__assert_fail
@@ -185,7 +175,14 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.errno.errno
 
     # stdio.h entrypoints
-    ${extra_entrypoints}
+    libc.src.stdio.printf
+    libc.src.stdio.vprintf
+    libc.src.stdio.fprintf
+    libc.src.stdio.vfprintf
+    libc.src.stdio.sprintf
+    libc.src.stdio.snprintf
+    libc.src.stdio.vsprintf
+    libc.src.stdio.vsnprintf
     libc.src.stdio.feof
     libc.src.stdio.ferror
     libc.src.stdio.fseek
diff --git a/libc/src/__support/arg_list.h b/libc/src/__support/arg_list.h
index 0965e12afd562..3a4e5ad0fab3c 100644
--- a/libc/src/__support/arg_list.h
+++ b/libc/src/__support/arg_list.h
@@ -54,7 +54,8 @@ class MockArgList {
   }
 
   template <class T> LIBC_INLINE T next_var() {
-    ++arg_counter;
+    arg_counter =
+        ((arg_counter + alignof(T) - 1) / alignof(T)) * alignof(T) + sizeof(T);
     return T(arg_counter);
   }
 
diff --git a/libc/src/gpu/rpc_fprintf.cpp b/libc/src/gpu/rpc_fprintf.cpp
index 7b0e60b59baf3..659144d133004 100644
--- a/libc/src/gpu/rpc_fprintf.cpp
+++ b/libc/src/gpu/rpc_fprintf.cpp
@@ -29,6 +29,9 @@ int fprintf_impl(::FILE *__restrict file, const char *__restrict format,
   }
 
   port.send_n(format, format_size);
+  port.recv([&](rpc::Buffer *buffer) {
+    args_size = static_cast<size_t>(buffer->data[0]);
+  });
   port.send_n(args, args_size);
 
   uint32_t ret = 0;
@@ -50,7 +53,7 @@ int fprintf_impl(::FILE *__restrict file, const char *__restrict format,
   return ret;
 }
 
-// TODO: This is a stand-in function that uses a struct pointer and size in
+// TODO: Delete this and port OpenMP to use `printf`.
 // place of varargs. Once varargs support is added we will use that to
 // implement the real version.
 LLVM_LIBC_FUNCTION(int, rpc_fprintf,
diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt
index a659d9e847a9e..3c536a287b2c4 100644
--- a/libc/src/stdio/CMakeLists.txt
+++ b/libc/src/stdio/CMakeLists.txt
@@ -159,17 +159,6 @@ add_entrypoint_object(
     libc.src.stdio.printf_core.writer
 )
 
-add_entrypoint_object(
-  fprintf
-  SRCS
-    fprintf.cpp
-  HDRS
-    fprintf.h
-  DEPENDS
-    libc.src.__support.arg_list
-    libc.src.stdio.printf_core.vfprintf_internal
-)
-
 add_entrypoint_object(
   vsprintf
   SRCS
@@ -192,17 +181,6 @@ add_entrypoint_object(
     libc.src.stdio.printf_core.writer
 )
 
-add_entrypoint_object(
-  vfprintf
-  SRCS
-    vfprintf.cpp
-  HDRS
-    vfprintf.h
-  DEPENDS
-    libc.src.__support.arg_list
-    libc.src.stdio.printf_core.vfprintf_internal
-)
-
 add_stdio_entrypoint_object(
   fileno
   SRCS
@@ -261,6 +239,7 @@ add_stdio_entrypoint_object(fputc)
 add_stdio_entrypoint_object(putc)
 add_stdio_entrypoint_object(putchar)
 add_stdio_entrypoint_object(printf)
+add_stdio_entrypoint_object(fprintf)
 add_stdio_entrypoint_object(fgetc)
 add_stdio_entrypoint_object(fgetc_unlocked)
 add_stdio_entrypoint_object(getc)
@@ -273,3 +252,4 @@ add_stdio_entrypoint_object(stdin)
 add_stdio_entrypoint_object(stdout)
 add_stdio_entrypoint_object(stderr)
 add_stdio_entrypoint_object(vprintf)
+add_stdio_entrypoint_object(vfprintf)
diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt
index 9cd4cfdae17f4..e0a1c577efcb6 100644
--- a/libc/src/stdio/generic/CMakeLists.txt
+++ b/libc/src/stdio/generic/CMakeLists.txt
@@ -396,6 +396,31 @@ add_entrypoint_object(
     ${printf_deps}
 )
 
+add_entrypoint_object(
+  fprintf
+  SRCS
+    fprintf.cpp
+  HDRS
+    ../fprintf.h
+  DEPENDS
+    libc.src.__support.arg_list
+    libc.src.stdio.printf_core.vfprintf_internal
+    ${printf_deps}
+)
+
+add_entrypoint_object(
+  vfprintf
+  SRCS
+    vfprintf.cpp
+  HDRS
+    ../vfprintf.h
+  DEPENDS
+    libc.src.__support.arg_list
+    libc.src.stdio.printf_core.vfprintf_internal
+    ${printf_deps}
+)
+
+
 add_entrypoint_object(
   fgets
   SRCS
diff --git a/libc/src/stdio/fprintf.cpp b/libc/src/stdio/generic/fprintf.cpp
similarity index 100%
rename from libc/src/stdio/fprintf.cpp
rename to libc/src/stdio/generic/fprintf.cpp
diff --git a/libc/src/stdio/vfprintf.cpp b/libc/src/stdio/generic/vfprintf.cpp
similarity index 100%
rename from libc/src/stdio/vfprintf.cpp
rename to libc/src/stdio/generic/vfprintf.cpp
diff --git a/libc/src/stdio/gpu/CMakeLists.txt b/libc/src/stdio/gpu/CMakeLists.txt
index 1b1e2a903cc0b..280e0d3f6b00d 100644
--- a/libc/src/stdio/gpu/CMakeLists.txt
+++ b/libc/src/stdio/gpu/CMakeLists.txt
@@ -10,6 +10,14 @@ add_header_library(
     .stderr
 )
 
+add_header_library(
+  vfprintf_utils
+  HDRS
+    vfprintf_utils.h
+  DEPENDS
+    .gpu_file
+)
+
 add_entrypoint_object(
   feof
   SRCS
@@ -262,6 +270,46 @@ add_entrypoint_object(
     .gpu_file
 )
 
+add_entrypoint_object(
+  printf
+  SRCS
+    printf.cpp
+  HDRS
+    ../printf.h
+  DEPENDS
+    .vfprintf_utils
+)
+
+add_entrypoint_object(
+  vprintf
+  SRCS
+    vprintf.cpp
+  HDRS
+    ../vprintf.h
+  DEPENDS
+    .vfprintf_utils
+)
+
+add_entrypoint_object(
+  fprintf
+  SRCS
+    fprintf.cpp
+  HDRS
+    ../fprintf.h
+  DEPENDS
+    .vfprintf_utils
+)
+
+add_entrypoint_object(
+  vfprintf
+  SRCS
+    vfprintf.cpp
+  HDRS
+    ../vfprintf.h
+  DEPENDS
+    .vfprintf_utils
+)
+
 add_entrypoint_object(
   stdin
   SRCS
diff --git a/libc/src/stdio/gpu/fprintf.cpp b/libc/src/stdio/gpu/fprintf.cpp
new file mode 100644
index 0000000000000..adae0203180a4
--- /dev/null
+++ b/libc/src/stdio/gpu/fprintf.cpp
@@ -0,0 +1,32 @@
+//===-- GPU Implementation of fprintf -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/fprintf.h"
+
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/arg_list.h"
+#include "src/errno/libc_errno.h"
+#include "src/stdio/gpu/vfprintf_utils.h"
+
+#include <stdio.h>
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, fprintf,
+                   (::FILE *__restrict stream, const char *__restrict format,
+                    ...)) {
+  va_list vlist;
+  va_start(vlist, format);
+  cpp::string_view str_view(format);
+  int ret_val =
+      file::vfprintf_internal(stream, format, str_view.size() + 1, vlist);
+  va_end(vlist);
+  return ret_val;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/gpu/printf.cpp b/libc/src/stdio/gpu/printf.cpp
new file mode 100644
index 0000000000000..44905f24bad7d
--- /dev/null
+++ b/libc/src/stdio/gpu/printf.cpp
@@ -0,0 +1,30 @@
+//===-- GPU Implementation of printf --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/printf.h"
+
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/arg_list.h"
+#include "src/errno/libc_errno.h"
+#include "src/stdio/gpu/vfprintf_utils.h"
+
+#include <stdio.h>
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) {
+  va_list vlist;
+  va_start(vlist, format);
+  cpp::string_view str_view(format);
+  int ret_val =
+      file::vfprintf_internal(stdout, format, str_view.size() + 1, vlist);
+  va_end(vlist);
+  return ret_val;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/gpu/vfprintf.cpp b/libc/src/stdio/gpu/vfprintf.cpp
new file mode 100644
index 0000000000000..2ec65d9afcb97
--- /dev/null
+++ b/libc/src/stdio/gpu/vfprintf.cpp
@@ -0,0 +1,29 @@
+//===-- GPU Implementation of vfprintf ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/vfprintf.h"
+
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/arg_list.h"
+#include "src/errno/libc_errno.h"
+#include "src/stdio/gpu/vfprintf_utils.h"
+
+#include <stdio.h>
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, vfprintf,
+                   (::FILE *__restrict stream, const char *__restrict format,
+                    va_list vlist)) {
+  cpp::string_view str_view(format);
+  int ret_val =
+      file::vfprintf_internal(stream, format, str_view.size() + 1, vlist);
+  return ret_val;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/gpu/vfprintf_utils.h b/libc/src/stdio/gpu/vfprintf_utils.h
new file mode 100644
index 0000000000000..e1a3b97b35673
--- /dev/null
+++ b/libc/src/stdio/gpu/vfprintf_utils.h
@@ -0,0 +1,73 @@
+//===--- GPU helper functions for printf using RPC ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/RPC/rpc_client.h"
+#include "src/__support/arg_list.h"
+#include "src/stdio/gpu/file.h"
+#include "src/string/string_utils.h"
+
+#include <stdio.h>
+
+namespace LIBC_NAMESPACE {
+namespace file {
+
+template <uint16_t opcode>
+LIBC_INLINE int vfprintf_impl(::FILE *__restrict file,
+                              const char *__restrict format, size_t format_size,
+                              va_list vlist) {
+  uint64_t mask = gpu::get_lane_mask();
+  rpc::Client::Port port = rpc::client.open<opcode>();
+
+  if constexpr (opcode == RPC_PRINTF_TO_STREAM) {
+    port.send([&](rpc::Buffer *buffer) {
+      buffer->data[0] = reinterpret_cast<uintptr_t>(file);
+    });
+  }
+
+  size_t args_size = 0;
+  port.send_n(format, format_size);
+  port.recv([&](rpc::Buffer *buffer) {
+    args_size = static_cast<size_t>(buffer->data[0]);
+  });
+  port.send_n(vlist, args_size);
+
+  uint32_t ret = 0;
+  for (;;) {
+    const char *str = nullptr;
+    port.recv([&](rpc::Buffer *buffer) {
+      ret = static_cast<uint32_t>(buffer->data[0]);
+      str = reinterpret_cast<const char *>(buffer->data[1]);
+    });
+    // If any lanes have a string argument it needs to be copied back.
+    if (!gpu::ballot(mask, str))
+      break;
+
+    uint64_t size = str ? internal::string_length(str) + 1 : 0;
+    port.send_n(str, size);
+  }
+
+  port.close();
+  return ret;
+}
+
+LIBC_INLINE int vfprintf_internal(::FILE *__restrict stream,
+                                  const char *__restrict format,
+                                  size_t format_size, va_list vlist) {
+  if (stream == stdout)
+    return vfprintf_impl<RPC_PRINTF_TO_STDOUT>(stream, format, format_size,
+                                               vlist);
+  else if (stream == stderr)
+    return vfprintf_impl<RPC_PRINTF_TO_STDERR>(stream, format, format_size,
+                                               vlist);
+  else
+    return vfprintf_impl<RPC_PRINTF_TO_STREAM>(stream, format, format_size,
+                                               vlist);
+}
+
+} // namespace file
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/gpu/vprintf.cpp b/libc/src/stdio/gpu/vprintf.cpp
new file mode 100644
index 0000000000000..ee5b89a1a33d1
--- /dev/null
+++ b/libc/src/stdio/gpu/vprintf.cpp
@@ -0,0 +1,28 @@
+//===-- GPU Implementation of vprintf -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/vprintf.h"
+
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/arg_list.h"
+#include "src/errno/libc_errno.h"
+#include "src/stdio/gpu/vfprintf_utils.h"
+
+#include <stdio.h>
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, vprintf,
+                   (const char *__restrict format, va_list vlist)) {
+  cpp::string_view str_view(format);
+  int ret_val =
+      file::vfprintf_internal(stdout, format, str_view.size() + 1, vlist);
+  return ret_val;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/test/integration/src/stdio/gpu/CMakeLists.txt b/libc/test/integration/src/stdio/gpu/CMakeLists.txt
index 6327c45e1ea5a..04fbd4706c556 100644
--- a/libc/test/integration/src/stdio/gpu/CMakeLists.txt
+++ b/libc/test/integration/src/stdio/gpu/CMakeLists.txt
@@ -13,7 +13,7 @@ add_integration_test(
   SRCS
     printf.cpp
   DEPENDS
-    libc.src.gpu.rpc_fprintf
+    libc.src.stdio.fprintf
     libc.src.stdio.fopen
   LOADER_ARGS
     --threads 32
diff --git a/libc/test/integration/src/stdio/gpu/printf.cpp b/libc/test/integration/src/stdio/gpu/printf.cpp
index 97ad4ace1dcac..766c4f9439115 100644
--- a/libc/test/integration/src/stdio/gpu/printf.cpp
+++ b/libc/test/integration/src/stdio/gpu/printf.cpp
@@ -9,8 +9,8 @@
 #include "test/IntegrationTest/test.h"
 
 #include "src/__support/GPU/utils.h"
-#include "src/gpu/rpc_fprintf.h"
 #include "src/stdio/fopen.h"
+#include "src/stdio/fprintf.h"
 
 using namespace LIBC_NAMESPACE;
 
@@ -20,68 +20,51 @@ TEST_MAIN(int argc, char **argv, char **envp) {
   ASSERT_TRUE(file && "failed to open file");
   // Check basic printing.
   int written = 0;
-  written = LIBC_NAMESPACE::rpc_fprintf(file, "A simple string\n", nullptr, 0);
+  written = LIBC_NAMESPACE::fprintf(file, "A simple string\n");
   ASSERT_EQ(written, 16);
 
   const char *str = "A simple string\n";
-  written = LIBC_NAMESPACE::rpc_fprintf(file, "%s", &str, sizeof(void *));
+  written = LIBC_NAMESPACE::fprintf(file, "%s", str);
   ASSERT_EQ(written, 16);
 
   // Check printing a different value with each thread.
   uint64_t thread_id = gpu::get_thread_id();
-  written = LIBC_NAMESPACE::rpc_fprintf(file, "%8ld\n", &thread_id,
-                                        sizeof(thread_id));
+  written = LIBC_NAMESPACE::fprintf(file, "%8ld\n", thread_id);
   ASSERT_EQ(written, 9);
 
-  struct {
-    uint32_t x = 1;
-    char c = 'c';
-    double f = 1.0;
-  } args1;
-  written =
-      LIBC_NAMESPACE::rpc_fprintf(file, "%d%c%.1f\n", &args1, sizeof(args1));
+  written = LIBC_NAMESPACE::fprintf(file, "%d%c%.1f\n", 1, 'c', 1.0);
   ASSERT_EQ(written, 6);
 
-  struct {
-    uint32_t x = 1;
-    const char *str = "A simple string\n";
-  } args2;
-  written =
-      LIBC_NAMESPACE::rpc_fprintf(file, "%032b%s\n", &args2, sizeof(args2));
+  written = LIBC_NAMESPACE::fprintf(file, "%032b%s\n", 1, "A simple string\n");
   ASSERT_EQ(written, 49);
 
   // Check that the server correctly handles divergent numbers of arguments.
   const char *format = gpu::get_thread_id() % 2 ? "%s" : "%20ld\n";
-  written = LIBC_NAMESPACE::rpc_fprintf(file, format, &str, sizeof(void *));
+  written = LIBC_NAMESPACE::fprintf(file, format, str);
   ASSERT_EQ(written, gpu::get_thread_id() % 2 ? 16 : 21);
 
   format = gpu::get_thread_id() % 2 ? "%s" : str;
-  written = LIBC_NAMESPACE::rpc_fprintf(file, format, &str, sizeof(void *));
+  written = LIBC_NAMESPACE::fprintf(file, format, str);
   ASSERT_EQ(written, 16);
 
   // Check that we handle null arguments correctly.
   struct {
     void *null = nullptr;
   } args3;
-  written = LIBC_NAMESPACE::rpc_fprintf(file, "%p", &args3, sizeof(args3));
+  written = LIBC_NAMESPACE::fprintf(file, "%p", nullptr);
   ASSERT_EQ(written, 9);
 
 #ifndef LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS
-  written = LIBC_NAMESPACE::rpc_fprintf(file, "%s", &args3, sizeof(args3));
+  written = LIBC_NAMESPACE::fprintf(file, "%s", nullptr);
   ASSERT_EQ(written, 6);
 #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS
 
   // Check for extremely abused variable width arguments
-  struct {
-    uint32_t x = 1;
-    uint32_t y = 2;
-    double f = 1.0;
-  } args4;
-  written = LIBC_NAMESPACE::rpc_fprintf(file, "%**d", &args4, sizeof(args4));
+  written = LIBC_NAMESPACE::fprintf(file, "%**d", 1, 2, 1.0);
   ASSERT_EQ(written, 4);
-  written = LIBC_NAMESPACE::rpc_fprintf(file, "%**d%6d", &args4, sizeof(args4));
+  written = LIBC_NAMESPACE::fprintf(file, "%**d%6d", 1, 2, 1.0);
   ASSERT_EQ(written, 10);
-  written = LIBC_NAMESPACE::rpc_fprintf(file, "%**.**f", &args4, sizeof(args4));
+  written = LIBC_NAMESPACE::fprintf(file, "%**.**f", 1, 2, 1.0);
   ASSERT_EQ(written, 7);
 
   return 0;
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index 095f3fa13ffad..8813f67093010 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -44,7 +44,7 @@ template <uint32_t lane_size> void handle_printf(rpc::Server::Port &port) {
   // Get the appropriate output stream to use.
   if (port.get_opcode() == RPC_PRINTF_TO_STREAM)
     port.recv([&](rpc::Buffer *buffer, uint32_t id) {
-      files[id] = reinterpret_cast<FILE *>(buffer->data[0]);
+      files[id] = file::to_stream(buffer->data[0]);
     });
   else if (port.get_opcode() == RPC_PRINTF_TO_STDOUT)
     std::fill(files, files + lane_size, stdout);
@@ -60,6 +60,28 @@ template <uint32_t lane_size> void handle_printf(rpc::Server::Port &port) {
   // Recieve the format string and arguments from the client.
   port.recv_n(format, format_sizes,
               [&](uint64_t size) { return new char[size]; });
+
+  // Parse the format string to get the expected size of the buffer.
+  for (uint32_t lane = 0; lane < lane_size; ++lane) {
+    if (!format[lane])
+      continue;
+
+    WriteBuffer wb(nullptr, 0);
+    Writer writer(&wb);
+
+    internal::MockArgList printf_args;
+    Parser<internal::MockArgList &> parser(
+        reinterpret_cast<const char *>(format[lane]), printf_args);
+
+    for (FormatSection cur_section = parser.get_next_section();
+         !cur_section.raw_string.empty();
+         cur_section = parser.get_next_section())
+      ;
+    args_sizes[lane] = printf_args.read_count();
+  }
+  port.send([&](rpc::Buffer *buffer, uint32_t id) {
+    buffer->data[0] = args_sizes[id];
+  });
   port.recv_n(args, args_sizes, [&](uint64_t size) { return new char[size]; });
 
   // Identify any arguments that are actually pointers to strings on the client.
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index d340bc041ccda..df70d7427e52c 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -934,7 +934,13 @@ struct Amdgpu final : public VariadicABIInfo {
   }
 
   VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override {
-    return {Align(4), false};
+    // NVPTX expects natural alignment in all cases. The variadic call ABI will
+    // handle promoting types to their appropriate size and alignment.
+    const unsigned MinAlign = 1;
+    Align A = DL.getABITypeAlign(Parameter);
+    if (A < MinAlign)
+      A = Align(MinAlign);
+    return {A, false};
   }
 };