Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[libc][NFC] Remove inline assembly for PTX instructions #79913

Merged
merged 1 commit into from
Jan 30, 2024

Conversation

jhuber6
Copy link
Contributor

@jhuber6 jhuber6 commented Jan 29, 2024

Summary:
Recent patches have implemented builtin versions of these functions.
This patch simply removes uses of inline assembly to hopefully improve
optimizations in this area.

Summary:
Recent patches have implemented builtin versions of these functions.
This patch simply removes uses of inline assembly to hopefully improve
optimizations in this area.
@llvmbot
Copy link
Collaborator

llvmbot commented Jan 29, 2024

@llvm/pr-subscribers-libc

Author: Joseph Huber (jhuber6)

Changes

Summary:
Recent patches have implemented builtin versions of these functions.
This patch simply removes uses of inline assembly to hopefully improve
optimizations in this area.


Full diff: https://github.com/llvm/llvm-project/pull/79913.diff

3 Files Affected:

  • (modified) libc/src/__support/GPU/nvptx/utils.h (+4-13)
  • (modified) libc/src/__support/RPC/rpc_util.h (+1-1)
  • (modified) libc/src/time/gpu/nanosleep.cpp (+1-1)
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
index 1519f36850a63..9fe3caa491475 100644
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -105,9 +105,7 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
 
 /// Returns the bit-mask of active threads in the current warp.
 [[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
-  uint32_t mask;
-  LIBC_INLINE_ASM("activemask.b32 %0;" : "=r"(mask));
-  return mask;
+  return __nvvm_activemask();
 }
 
 /// Copies the value from the first active thread in the warp to the rest.
@@ -141,23 +139,16 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
 
 /// Returns the current value of the GPU's processor clock.
 LIBC_INLINE uint64_t processor_clock() {
-  uint64_t timestamp;
-  LIBC_INLINE_ASM("mov.u64  %0, %%clock64;" : "=l"(timestamp));
-  return timestamp;
+  return __nvvm_read_ptx_sreg_clock64();
 }
 
 /// Returns a global fixed-frequency timer at nanosecond frequency.
 LIBC_INLINE uint64_t fixed_frequency_clock() {
-  uint64_t nsecs;
-  LIBC_INLINE_ASM("mov.u64  %0, %%globaltimer;" : "=l"(nsecs));
-  return nsecs;
+  return __nvvm_read_ptx_sreg_globaltimer();
 }
 
 /// Terminates execution of the calling thread.
-[[noreturn]] LIBC_INLINE void end_program() {
-  LIBC_INLINE_ASM("exit;" ::: "memory");
-  __builtin_unreachable();
-}
+[[noreturn]] LIBC_INLINE void end_program() { __nvvm_exit(); }
 
 } // namespace gpu
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h
index 04620b0487f4a..ff9569298a1ed 100644
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/src/__support/RPC/rpc_util.h
@@ -22,7 +22,7 @@ namespace rpc {
 /// Suspend the thread briefly to assist the thread scheduler during busy loops.
 LIBC_INLINE void sleep_briefly() {
 #if defined(LIBC_TARGET_ARCH_IS_NVPTX) && __CUDA_ARCH__ >= 700
-  LIBC_INLINE_ASM("nanosleep.u32 64;" ::: "memory");
+  __nvvm_nanosleep(64);
 #elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
   __builtin_amdgcn_s_sleep(2);
 #elif defined(LIBC_TARGET_ARCH_IS_X86)
diff --git a/libc/src/time/gpu/nanosleep.cpp b/libc/src/time/gpu/nanosleep.cpp
index a0c735502ff58..e84fe622100e8 100644
--- a/libc/src/time/gpu/nanosleep.cpp
+++ b/libc/src/time/gpu/nanosleep.cpp
@@ -29,7 +29,7 @@ LLVM_LIBC_FUNCTION(int, nanosleep,
   // slept will be somewhere between zero and twice the requested amount. Here
   // we will sleep again if we undershot the time.
   while (cur < end) {
-    LIBC_INLINE_ASM("nanosleep.u32 %0;" ::"r"(nsecs));
+    __nvvm_nanosleep(static_cast<uint32_t>(nsecs));
     cur = gpu::fixed_frequency_clock();
     nsecs -= nsecs > cur - start ? cur - start : 0;
   }

@jhuber6 jhuber6 merged commit d29bb70 into llvm:main Jan 30, 2024
5 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants