Skip to content

Commit

Permalink
[NVPTX] Add intrinsics to support named barriers.
Browse files Browse the repository at this point in the history
Support for barrier synchronization between a subset of threads
in a CTA through one of sixteen explicitly specified barriers.
These intrinsics are not directly exposed in CUDA but are
critical for forthcoming support of OpenMP on NVPTX GPUs.

The intrinsics allow the synchronization of an arbitrary
(multiple of 32) number of threads in a CTA at one of 16
distinct barriers. The two intrinsics added are as follows:

call void @llvm.nvvm.barrier.n(i32 10)
waits for all threads in a CTA to arrive at named barrier #10.

call void @llvm.nvvm.barrier(i32 15, i32 992)
waits for 992 threads in a CTA to arrive at barrier #15.

Detailed description of these intrinsics are available in the PTX manual.
http://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions

Reviewers: hfinkel, jlebar
Differential Revision: https://reviews.llvm.org/D17657

llvm-svn: 293384
  • Loading branch information
arpith-jacob committed Jan 28, 2017
1 parent f58469a commit 2b156ed
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 0 deletions.
7 changes: 7 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Expand Up @@ -733,6 +733,13 @@ let TargetPrefix = "nvvm" in {
// intrinsics in this file, this one is a user-facing API.
def int_nvvm_barrier0 : GCCBuiltin<"__syncthreads">,
Intrinsic<[], [], [IntrConvergent]>;
// Synchronize all threads in the CTA at barrier 'n'.
def int_nvvm_barrier_n : GCCBuiltin<"__nvvm_bar_n">,
Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>;
// Synchronize 'm', a multiple of warp size, (arg 2) threads in
// the CTA at barrier 'n' (arg 1).
def int_nvvm_barrier : GCCBuiltin<"__nvvm_bar">,
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent]>;
def int_nvvm_barrier0_popc : GCCBuiltin<"__nvvm_bar0_popc">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent]>;
def int_nvvm_barrier0_and : GCCBuiltin<"__nvvm_bar0_and">,
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Expand Up @@ -36,6 +36,12 @@ let isConvergent = 1 in {
def INT_BARRIER0 : NVPTXInst<(outs), (ins),
"bar.sync \t0;",
[(int_nvvm_barrier0)]>;
def INT_BARRIERN : NVPTXInst<(outs), (ins Int32Regs:$src1),
"bar.sync \t$src1;",
[(int_nvvm_barrier_n Int32Regs:$src1)]>;
def INT_BARRIER : NVPTXInst<(outs), (ins Int32Regs:$src1, Int32Regs:$src2),
"bar.sync \t$src1, $src2;",
[(int_nvvm_barrier Int32Regs:$src1, Int32Regs:$src2)]>;
def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
!strconcat("{{ \n\t",
".reg .pred \t%p1; \n\t",
Expand Down
40 changes: 40 additions & 0 deletions llvm/test/CodeGen/NVPTX/named-barriers.ll
@@ -0,0 +1,40 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s

; Use bar.sync to arrive at a pre-computed barrier number and
; wait for all threads in CTA to also arrive:
define ptx_device void @test_barrier_named_cta() {
; CHECK: mov.u32 %r[[REG0:[0-9]+]], 0;
; CHECK: bar.sync %r[[REG0]];
; CHECK: mov.u32 %r[[REG1:[0-9]+]], 10;
; CHECK: bar.sync %r[[REG1]];
; CHECK: mov.u32 %r[[REG2:[0-9]+]], 15;
; CHECK: bar.sync %r[[REG2]];
; CHECK: ret;
call void @llvm.nvvm.barrier.n(i32 0)
call void @llvm.nvvm.barrier.n(i32 10)
call void @llvm.nvvm.barrier.n(i32 15)
ret void
}

; Use bar.sync to arrive at a pre-computed barrier number and
; wait for fixed number of cooperating threads to arrive:
define ptx_device void @test_barrier_named() {
; CHECK: mov.u32 %r[[REG0A:[0-9]+]], 32;
; CHECK: mov.u32 %r[[REG0B:[0-9]+]], 0;
; CHECK: bar.sync %r[[REG0B]], %r[[REG0A]];
; CHECK: mov.u32 %r[[REG1A:[0-9]+]], 352;
; CHECK: mov.u32 %r[[REG1B:[0-9]+]], 10;
; CHECK: bar.sync %r[[REG1B]], %r[[REG1A]];
; CHECK: mov.u32 %r[[REG2A:[0-9]+]], 992;
; CHECK: mov.u32 %r[[REG2B:[0-9]+]], 15;
; CHECK: bar.sync %r[[REG2B]], %r[[REG2A]];
; CHECK: ret;
call void @llvm.nvvm.barrier(i32 0, i32 32)
call void @llvm.nvvm.barrier(i32 10, i32 352)
call void @llvm.nvvm.barrier(i32 15, i32 992)
ret void
}

declare void @llvm.nvvm.barrier(i32, i32)
declare void @llvm.nvvm.barrier.n(i32)

0 comments on commit 2b156ed

Please sign in to comment.