New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NVPTX] Add builtin support for 'globaltimer' #79765
Conversation
@llvm/pr-subscribers-clang @llvm/pr-subscribers-llvm-ir Author: Joseph Huber (jhuber6) ChangesSummary: Full diff: https://github.com/llvm/llvm-project/pull/79765.diff 5 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 0f2e8260143be78..57a229ded49f886 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -146,6 +146,7 @@ BUILTIN(__nvvm_read_ptx_sreg_lanemask_gt, "i", "nc")
BUILTIN(__nvvm_read_ptx_sreg_clock, "i", "n")
BUILTIN(__nvvm_read_ptx_sreg_clock64, "LLi", "n")
+BUILTIN(__nvvm_read_ptx_sreg_globaltimer, "LLi", "n")
BUILTIN(__nvvm_read_ptx_sreg_pm0, "i", "n")
BUILTIN(__nvvm_read_ptx_sreg_pm1, "i", "n")
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index 353f3ebb608c2b1..5aab6bee5b1cc1d 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -134,11 +134,13 @@ __device__ long long read_clocks() {
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock()
// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64()
+// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.globaltimer()
int a = __nvvm_read_ptx_sreg_clock();
long long b = __nvvm_read_ptx_sreg_clock64();
+ long long c = __nvvm_read_ptx_sreg_globaltimer();
- return a + b;
+ return a + b + c;
}
__device__ int read_pms() {
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 5a5ba2592e1467e..8c9ed4a349ba998 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4506,6 +4506,8 @@ def int_nvvm_read_ptx_sreg_lanemask_gt :
def int_nvvm_read_ptx_sreg_clock : PTXReadNCSRegIntrinsic_r32<"clock">;
def int_nvvm_read_ptx_sreg_clock64 : PTXReadNCSRegIntrinsic_r64<"clock64">;
+def int_nvvm_read_ptx_sreg_globaltimer : PTXReadNCSRegIntrinsic_r64<"globaltimer">;
+
def int_nvvm_read_ptx_sreg_pm0 : PTXReadNCSRegIntrinsic_r32<"pm0">;
def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32<"pm1">;
def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32<"pm2">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 33f1e4a43e072af..5c509b50411701a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -6364,6 +6364,8 @@ def INT_PTX_SREG_CLOCK :
PTX_READ_SREG_R32<"clock", int_nvvm_read_ptx_sreg_clock>;
def INT_PTX_SREG_CLOCK64 :
PTX_READ_SREG_R64<"clock64", int_nvvm_read_ptx_sreg_clock64>;
+def INT_PTX_SREG_GLOBALTIMER :
+ PTX_READ_SREG_R64<"globaltimer", int_nvvm_read_ptx_sreg_globaltimer>;
def INT_PTX_SREG_PM0 : PTX_READ_SREG_R32<"pm0", int_nvvm_read_ptx_sreg_pm0>;
def INT_PTX_SREG_PM1 : PTX_READ_SREG_R32<"pm1", int_nvvm_read_ptx_sreg_pm1>;
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index c09c7a72fd10181..ba521b2f11cbbec 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -133,6 +133,17 @@ define i64 @test_clock64() {
ret i64 %ret
}
+; CHECK-LABEL: test_globaltimer
+define i64 @test_globaltimer() {
+; CHECK: mov.u64 %r{{.*}}, %globaltimer;
+ %a = tail call i64 @llvm.nvvm.read.ptx.sreg.globaltimer()
+; CHECK: mov.u64 %r{{.*}}, %globaltimer;
+ %b = tail call i64 @llvm.nvvm.read.ptx.sreg.globaltimer()
+ %ret = add i64 %a, %b
+; CHECK: ret
+ ret i64 %ret
+}
+
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)
declare float @llvm.nvvm.sqrt.f(float)
|
9a07e31
to
cb2503e
Compare
Summary: This patch adds support for `globaltimer` to match `clock` and `clock64`. See the PTX ISA reference fro details. This patch does not implement the `hi` or `lo` variants for brevity as they can be obtained from this with the cost of an additional register. https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi
cb2503e
to
5c4fc3d
Compare
; CHECK-LABEL: test_globaltimer | ||
define i64 @test_globaltimer() { | ||
; CHECK: mov.u64 %r{{.*}}, %globaltimer; | ||
%a = tail call i64 @llvm.nvvm.read.ptx.sreg.globaltimer() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thise need sm_30+. Right now the test runs with sm_30. LLVM does compile these intrinsics, but ptxas fails because the register is not available on sm_20.
The test needs to be updated to use a reasonably new GPU target. Probably sm_60 is the oldest one anybody still cares about.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, I can do that real quick.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done in bb18085.
Summary:
This patch adds support for
globaltimer
to matchclock
andclock64
. See the PTX ISA reference for details. This patch does notimplement the
hi
orlo
variants for brevity as they can be obtainedfrom this with the cost of an additional register.
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi