[NVPTX, CUDA] barrier intrinsics and builtins for sm_90
Differential Revision: https://reviews.llvm.org/D151363
Artem-B committed May 25, 2023
1 parent 0a0bae1 commit 25708b3
Showing 6 changed files with 71 additions and 3 deletions.
5 changes: 5 additions & 0 deletions clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -582,6 +582,11 @@ TARGET_BUILTIN(__nvvm_bar_warp_sync, "vUi", "n", PTX60)
TARGET_BUILTIN(__nvvm_barrier_sync, "vUi", "n", PTX60)
TARGET_BUILTIN(__nvvm_barrier_sync_cnt, "vUiUi", "n", PTX60)

TARGET_BUILTIN(__nvvm_barrier_cluster_arrive, "v", "n", AND(SM_90,PTX78))
TARGET_BUILTIN(__nvvm_barrier_cluster_arrive_relaxed, "v", "n", AND(SM_90,PTX80))
TARGET_BUILTIN(__nvvm_barrier_cluster_wait, "v", "n", AND(SM_90,PTX78))
TARGET_BUILTIN(__nvvm_fence_sc_cluster, "v", "n", AND(SM_90,PTX78))

// Shuffle

BUILTIN(__nvvm_shfl_down_i32, "iiii", "")
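For context, a minimal usage sketch of the four new builtins (a hypothetical kernel, not part of this change; it assumes a CUDA device compile for sm_90 with PTX 8.0 so that the relaxed arrive variant is also available). The arrive/wait pair splits a cluster-wide barrier so independent work can overlap the two calls; __nvvm_fence_sc_cluster provides a cluster-scoped sequentially consistent fence.

// Hypothetical kernel, for illustration only (not from this commit).
__attribute__((global)) void cluster_split_barrier_example(int *buf) {
  // ... write this block's portion of buf ...

  // Mark arrival at the cluster barrier, then keep doing work that does not
  // depend on the other blocks in the cluster.
  __nvvm_barrier_cluster_arrive();
  // (__nvvm_barrier_cluster_arrive_relaxed() is the same arrival without the
  //  memory-ordering guarantees; it additionally requires PTX 8.0.)

  // ... independent work ...

  // Block until the rest of the cluster has arrived; after this it is safe
  // to read what the other blocks wrote before their arrive.
  __nvvm_barrier_cluster_wait();

  // Cluster-scoped sequentially consistent fence, e.g. before publishing a
  // result through memory to another consumer in the same cluster.
  __nvvm_fence_sc_cluster();
}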
12 changes: 12 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -18962,6 +18962,18 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_getctarank_shared_cluster),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_barrier_cluster_arrive:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive));
  case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive_relaxed));
  case NVPTX::BI__nvvm_barrier_cluster_wait:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_wait));
  case NVPTX::BI__nvvm_fence_sc_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
  default:
    return nullptr;
  }
11 changes: 10 additions & 1 deletion clang/test/CodeGenCUDA/builtins-sm90.cu
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx78" "-target-cpu" "sm_90" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx80" "-target-cpu" "sm_90" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s

// CHECK: define{{.*}} void @_Z6kernelPlPvj(
__attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
@@ -57,5 +57,14 @@ __attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
// CHECK: call i32 @llvm.nvvm.getctarank.shared.cluster(ptr addrspace(3) {{.*}})
out[i++] = __nvvm_getctarank_shared_cluster(sptr);

// CHECK: call void @llvm.nvvm.barrier.cluster.arrive()
__nvvm_barrier_cluster_arrive();
// CHECK: call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
__nvvm_barrier_cluster_arrive_relaxed();
// CHECK: call void @llvm.nvvm.barrier.cluster.wait()
__nvvm_barrier_cluster_wait();
// CHECK: call void @llvm.nvvm.fence.sc.cluster()
__nvvm_fence_sc_cluster();

// CHECK: ret void
}
10 changes: 10 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1358,13 +1358,23 @@ let TargetPrefix = "nvvm" in {
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>,
ClangBuiltin<"__nvvm_barrier_sync_cnt">;

// barrier.cluster.[wait, arrive, arrive.relaxed]
def int_nvvm_barrier_cluster_arrive :
Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
def int_nvvm_barrier_cluster_arrive_relaxed :
Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
def int_nvvm_barrier_cluster_wait :
Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;

// Membar
def int_nvvm_membar_cta : ClangBuiltin<"__nvvm_membar_cta">,
Intrinsic<[], [], [IntrNoCallback]>;
def int_nvvm_membar_gl : ClangBuiltin<"__nvvm_membar_gl">,
Intrinsic<[], [], [IntrNoCallback]>;
def int_nvvm_membar_sys : ClangBuiltin<"__nvvm_membar_sys">,
Intrinsic<[], [], [IntrNoCallback]>;
def int_nvvm_fence_sc_cluster:
Intrinsic<[], [], [IntrNoCallback]>;

// Async Copy
def int_nvvm_cp_async_mbarrier_arrive :
15 changes: 15 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -132,6 +132,18 @@ def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt),
"barrier.sync \t$id, $cnt;",
[(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
Requires<[hasPTX<60>, hasSM<30>]>;
class INT_BARRIER_CLUSTER<string variant, Intrinsic Intr,
list<Predicate> Preds = [hasPTX<78>, hasSM<90>]>:
NVPTXInst<(outs), (ins), "barrier.cluster."# variant #";", [(Intr)]>,
Requires<Preds>;

def barrier_cluster_arrive:
INT_BARRIER_CLUSTER<"arrive", int_nvvm_barrier_cluster_arrive>;
def barrier_cluster_arrive_relaxed:
INT_BARRIER_CLUSTER<"arrive.relaxed",
int_nvvm_barrier_cluster_arrive_relaxed, [hasPTX<80>, hasSM<90>]>;
def barrier_cluster_wait:
INT_BARRIER_CLUSTER<"wait", int_nvvm_barrier_cluster_wait>;

class SHFL_INSTR<bit sync, string mode, string reg, bit return_pred,
bit offset_imm, bit mask_imm, bit threadmask_imm>
@@ -303,6 +315,9 @@ def INT_MEMBAR_CTA : MEMBAR<"membar.cta;", int_nvvm_membar_cta>;
def INT_MEMBAR_GL : MEMBAR<"membar.gl;", int_nvvm_membar_gl>;
def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;

def INT_FENCE_SC_CLUSTER:
MEMBAR<"fence.sc.cluster;", int_nvvm_fence_sc_cluster>,
Requires<[hasPTX<78>, hasSM<90>]>;

//-----------------------------------
// Async Copy Functions
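Taken together with the clang pieces above, each builtin lowers 1:1 through an NVVM intrinsic to a single PTX instruction. A summary sketch (a hypothetical device function, not part of the change; the tests below exercise the same mapping):

// Hypothetical device function summarizing the lowering chain above.
__attribute__((device)) void cluster_barrier_lowering_summary() {
  __nvvm_barrier_cluster_arrive();         // llvm.nvvm.barrier.cluster.arrive         -> barrier.cluster.arrive;
  __nvvm_barrier_cluster_arrive_relaxed(); // llvm.nvvm.barrier.cluster.arrive.relaxed -> barrier.cluster.arrive.relaxed; (PTX 8.0)
  __nvvm_barrier_cluster_wait();           // llvm.nvvm.barrier.cluster.wait           -> barrier.cluster.wait;
  __nvvm_fence_sc_cluster();               // llvm.nvvm.fence.sc.cluster               -> fence.sc.cluster;
}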
21 changes: 19 additions & 2 deletions llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78| FileCheck --check-prefixes=CHECK %s
-; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}

; CHECK-LABEL: test_isspacep
define i1 @test_isspacep_shared_cluster(ptr %p) {
@@ -120,6 +120,19 @@ define i1 @test_is_explicit_cluster() {
ret i1 %x
}

; CHECK-LABEL: test_barrier_cluster(
define void @test_barrier_cluster() {
; CHECK: barrier.cluster.arrive;
call void @llvm.nvvm.barrier.cluster.arrive()
; CHECK: barrier.cluster.arrive.relaxed;
call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
; CHECK: barrier.cluster.wait;
call void @llvm.nvvm.barrier.cluster.wait()
; CHECK: fence.sc.cluster
call void @llvm.nvvm.fence.sc.cluster()
ret void
}
declare i1 @llvm.nvvm.isspacep.shared.cluster(ptr %p);
declare ptr @llvm.nvvm.mapa(ptr %p, i32 %r);
@@ -137,3 +150,7 @@ declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w()
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank()
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank()
declare i1 @llvm.nvvm.is_explicit_cluster()
declare void @llvm.nvvm.barrier.cluster.arrive()
declare void @llvm.nvvm.barrier.cluster.arrive.relaxed()
declare void @llvm.nvvm.barrier.cluster.wait()
declare void @llvm.nvvm.fence.sc.cluster()
