Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[libomptarget][nextgen-plugin] Use SCRELEASE/SCACQUIRE in packet header #85952

Open
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

doru1004
Copy link
Contributor

This patch updates the construction of packet headers to replace the usage of ACQUIRE/RELEASE with SCACQUIRE/SCRELEASE which is now recommended.
The patch also ensures consistency between kernel dispatches.

@llvmbot
Copy link
Collaborator

llvmbot commented Mar 20, 2024

@llvm/pr-subscribers-libc

@llvm/pr-subscribers-backend-amdgpu

Author: Gheorghe-Teodor Bercea (doru1004)

Changes

This patch updates the construction of packet headers to replace the usage of ACQUIRE/RELEASE with SCACQUIRE/SCRELEASE which is now recommended.
The patch also ensures consistency between kernel dispatches.


Full diff: https://github.com/llvm/llvm-project/pull/85952.diff

3 Files Affected:

  • (modified) libc/utils/gpu/loader/amdgpu/Loader.cpp (+2-1)
  • (modified) openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h (+2-2)
  • (modified) openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp (+12-11)
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index e3911eda2bd82a..7fd45acfd47e97 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -276,7 +276,8 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
       (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
       (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
   uint32_t header_word = header | (setup << 16u);
-  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
+  __atomic_store_n(reinterpret_cast<uint32_t *>(packet), header_word,
+                   __ATOMIC_RELEASE);
   hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
 
   // Wait until the kernel has completed execution on the device. Periodically
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
index 64a1d3308aed0b..0ac7661d7e0076 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
@@ -216,8 +216,8 @@ typedef enum { HSA_FENCE_SCOPE_SYSTEM = 2 } hsa_fence_scope_t;
 
 typedef enum {
   HSA_PACKET_HEADER_TYPE = 0,
-  HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9,
-  HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11
+  HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE = 9,
+  HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE = 11
 } hsa_packet_header_t;
 
 typedef enum {
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index fce7454bf2800d..c147cefe58e9b4 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -826,15 +826,15 @@ struct AMDGPUQueueTy {
   /// Assumes the queue lock is acquired.
   void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
                            hsa_kernel_dispatch_packet_t *Packet) {
-    uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
-
-    uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
-    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
-    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+    uint16_t Header =
+        (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
+        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
 
     // Publish the packet. Do not modify the package after this point.
     uint32_t HeaderWord = Header | (Setup << 16u);
-    __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
+    __atomic_store_n(reinterpret_cast<uint32_t *>(Packet), HeaderWord,
+                     __ATOMIC_RELEASE);
 
     // Signal the doorbell about the published packet.
     hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
@@ -845,15 +845,16 @@ struct AMDGPUQueueTy {
   /// barrier dependencies (signals) are satisfied. Assumes the queue is locked
   void publishBarrierPacket(uint64_t PacketId,
                             hsa_barrier_and_packet_t *Packet) {
-    uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
     uint16_t Setup = 0;
-    uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
-    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
-    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+    uint16_t Header =
+        (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
+        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
+        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
 
     // Publish the packet. Do not modify the package after this point.
     uint32_t HeaderWord = Header | (Setup << 16u);
-    __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
+    __atomic_store_n(reinterpret_cast<uint32_t *>(Packet), HeaderWord,
+                     __ATOMIC_RELEASE);
 
     // Signal the doorbell about the published packet.
     hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);

Copy link
Contributor

@jhuber6 jhuber6 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see, dynamic HSA. I always forget about that.

@jhuber6
Copy link
Contributor

jhuber6 commented Mar 20, 2024

Personally I would've just fixed this with direct push that just adds these two enum members.

@doru1004
Copy link
Contributor Author

Ah, I see, dynamic HSA. I always forget about that.

Yep, had to revert the previous commit and had to open a new PR too.

@jhuber6
Copy link
Contributor

jhuber6 commented Mar 20, 2024

Ah, I see, dynamic HSA. I always forget about that.

Yep, had to revert the previous commit and had to open a new PR too.

Self-reviewed PRs muddy the waters. If you're going to self review just do a git push and be done with it, I think that's the LLVM policy.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

3 participants