Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[NVPTX] Add intrinsics to support named barriers.
Support for barrier synchronization between a subset of threads in a CTA through one of sixteen explicitly specified barriers. These intrinsics are not directly exposed in CUDA but are critical for forthcoming support of OpenMP on NVPTX GPUs. The intrinsics allow the synchronization of an arbitrary (multiple of 32) number of threads in a CTA at one of 16 distinct barriers. The two intrinsics added are as follows: call void @llvm.nvvm.barrier.n(i32 10) waits for all threads in a CTA to arrive at named barrier #10. call void @llvm.nvvm.barrier(i32 15, i32 992) waits for 992 threads in a CTA to arrive at barrier #15. A detailed description of these intrinsics is available in the PTX manual: http://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions Reviewers: hfinkel, jlebar Differential Revision: https://reviews.llvm.org/D17657 llvm-svn: 293384
- Loading branch information
1 parent
f58469a
commit 2b156ed
Showing
3 changed files
with
53 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s | ||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s | ||
|
||
; Use bar.sync to arrive at a pre-computed barrier number and | ||
; wait for all threads in CTA to also arrive: | ||
; Test body: three calls to @llvm.nvvm.barrier.n, each passing only a barrier | ||
; number (0, 10, 15). For every call, in call order, the FileCheck patterns | ||
; below expect the number to be moved into a %r register with mov.u32 and that | ||
; same register to be the single operand of bar.sync; a final ret must follow. | ||
define ptx_device void @test_barrier_named_cta() { | ||
; CHECK: mov.u32 %r[[REG0:[0-9]+]], 0; | ||
; CHECK: bar.sync %r[[REG0]]; | ||
; CHECK: mov.u32 %r[[REG1:[0-9]+]], 10; | ||
; CHECK: bar.sync %r[[REG1]]; | ||
; CHECK: mov.u32 %r[[REG2:[0-9]+]], 15; | ||
; CHECK: bar.sync %r[[REG2]]; | ||
; CHECK: ret; | ||
; One call per mov/bar.sync pair matched above, in the same order. | ||
call void @llvm.nvvm.barrier.n(i32 0) | ||
call void @llvm.nvvm.barrier.n(i32 10) | ||
call void @llvm.nvvm.barrier.n(i32 15) | ||
ret void | ||
} | ||
|
||
; Use bar.sync to arrive at a pre-computed barrier number and | ||
; wait for fixed number of cooperating threads to arrive: | ||
; Test body: three calls to @llvm.nvvm.barrier, each passing a barrier number | ||
; (0, 10, 15) followed by a thread count (32, 352, 992; all multiples of 32). | ||
; For every call, in call order, the FileCheck patterns below expect the thread | ||
; count to be moved into a register first, then the barrier number, and then a | ||
; bar.sync whose first operand is the barrier-number register and whose second | ||
; operand is the thread-count register; a final ret must follow. | ||
define ptx_device void @test_barrier_named() { | ||
; CHECK: mov.u32 %r[[REG0A:[0-9]+]], 32; | ||
; CHECK: mov.u32 %r[[REG0B:[0-9]+]], 0; | ||
; CHECK: bar.sync %r[[REG0B]], %r[[REG0A]]; | ||
; CHECK: mov.u32 %r[[REG1A:[0-9]+]], 352; | ||
; CHECK: mov.u32 %r[[REG1B:[0-9]+]], 10; | ||
; CHECK: bar.sync %r[[REG1B]], %r[[REG1A]]; | ||
; CHECK: mov.u32 %r[[REG2A:[0-9]+]], 992; | ||
; CHECK: mov.u32 %r[[REG2B:[0-9]+]], 15; | ||
; CHECK: bar.sync %r[[REG2B]], %r[[REG2A]]; | ||
; CHECK: ret; | ||
; Arguments are (barrier number, thread count); the "B" captures above hold | ||
; the barrier number and the "A" captures hold the thread count. | ||
call void @llvm.nvvm.barrier(i32 0, i32 32) | ||
call void @llvm.nvvm.barrier(i32 10, i32 352) | ||
call void @llvm.nvvm.barrier(i32 15, i32 992) | ||
ret void | ||
} | ||
|
||
; Declarations of the two intrinsics called by the test functions in this file. | ||
; Two-operand form: called here as (barrier number, thread count). | ||
declare void @llvm.nvvm.barrier(i32, i32) | ||
; One-operand form: called here with the barrier number only. | ||
declare void @llvm.nvvm.barrier.n(i32) |