[NVPTX] Implemented shfl.sync instruction and supporting intrinsics/builtins.

Differential Revision: https://reviews.llvm.org/D38090

llvm-svn: 313820
Artem-B committed Sep 20, 2017
1 parent 562bf99 commit 4654dc8
Showing 8 changed files with 360 additions and 6 deletions.
9 changes: 9 additions & 0 deletions clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -390,6 +390,15 @@ BUILTIN(__nvvm_shfl_bfly_f32, "ffii", "")
BUILTIN(__nvvm_shfl_idx_i32, "iiii", "")
BUILTIN(__nvvm_shfl_idx_f32, "ffii", "")

TARGET_BUILTIN(__nvvm_shfl_sync_down_i32, "iUiiii", "", "ptx60")
TARGET_BUILTIN(__nvvm_shfl_sync_down_f32, "fUifii", "", "ptx60")
TARGET_BUILTIN(__nvvm_shfl_sync_up_i32, "iUiiii", "", "ptx60")
TARGET_BUILTIN(__nvvm_shfl_sync_up_f32, "fUifii", "", "ptx60")
TARGET_BUILTIN(__nvvm_shfl_sync_bfly_i32, "iUiiii", "", "ptx60")
TARGET_BUILTIN(__nvvm_shfl_sync_bfly_f32, "fUifii", "", "ptx60")
TARGET_BUILTIN(__nvvm_shfl_sync_idx_i32, "iUiiii", "", "ptx60")
TARGET_BUILTIN(__nvvm_shfl_sync_idx_f32, "fUifii", "", "ptx60")
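For context, a minimal sketch of how one of these builtins is called from device code, mirroring the call shapes exercised by the new builtins-nvptx-ptx60.cu test below (the function name and literal values are illustrative, not part of this commit):

__device__ int shfl_down_example(unsigned mask, int v) {
  // The last argument is the packed width/clamp word; 0x1f selects a
  // full-warp shuffle for the down/idx/bfly variants.
  return __nvvm_shfl_sync_down_i32(mask, v, /*offset*/ 4, /*mask_and_clamp*/ 0x1f);
}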

// Membar

BUILTIN(__nvvm_membar_cta, "v", "")
16 changes: 11 additions & 5 deletions clang/lib/Driver/ToolChains/Cuda.cpp
@@ -507,11 +507,17 @@ void CudaToolChain::addClangTargetOptions(
CC1Args.push_back("-mlink-cuda-bitcode");
CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));

// Libdevice in CUDA-7.0 requires PTX version that's more recent
// than LLVM defaults to. Use PTX4.2 which is the PTX version that
// came with CUDA-7.0.
CC1Args.push_back("-target-feature");
CC1Args.push_back("+ptx42");
if (CudaInstallation.version() >= CudaVersion::CUDA_90) {
// CUDA-9 uses new instructions that are only available in PTX6.0
CC1Args.push_back("-target-feature");
CC1Args.push_back("+ptx60");
} else {
// Libdevice in CUDA-7.0 requires PTX version that's more recent
// than LLVM defaults to. Use PTX4.2 which is the PTX version that
// came with CUDA-7.0.
CC1Args.push_back("-target-feature");
CC1Args.push_back("+ptx42");
}
}

void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
68 changes: 68 additions & 0 deletions clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -92,6 +92,74 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

// __shfl_sync_* variants available in CUDA-9
#if CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
__Mask) \
inline __device__ int __FnName(unsigned int __mask, int __val, int __offset, \
int __width = warpSize) { \
return __IntIntrinsic(__mask, __val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ float __FnName(unsigned int __mask, float __val, \
int __offset, int __width = warpSize) { \
return __FloatIntrinsic(__mask, __val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ unsigned int __FnName(unsigned int __mask, \
unsigned int __val, int __offset, \
int __width = warpSize) { \
return static_cast<unsigned int>( \
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
} \
inline __device__ long long __FnName(unsigned int __mask, long long __val, \
int __offset, int __width = warpSize) { \
struct __Bits { \
int __a, __b; \
}; \
_Static_assert(sizeof(__val) == sizeof(__Bits)); \
_Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
__Bits __tmp; \
memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
__tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
long long __ret; \
memcpy(&__ret, &__tmp, sizeof(__tmp)); \
return __ret; \
} \
inline __device__ unsigned long long __FnName( \
unsigned int __mask, unsigned long long __val, int __offset, \
int __width = warpSize) { \
return static_cast<unsigned long long>(::__FnName( \
__mask, static_cast<long long>(__val), __offset, __width)); \
} \
inline __device__ double __FnName(unsigned int __mask, double __val, \
int __offset, int __width = warpSize) { \
long long __tmp; \
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp = ::__FnName(__mask, __tmp, __offset, __width); \
double __ret; \
memcpy(&__ret, &__tmp, sizeof(__ret)); \
return __ret; \
}
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
__nvvm_shfl_sync_idx_f32, 0x1f);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_sync_up, __nvvm_shfl_sync_up_i32,
__nvvm_shfl_sync_up_f32, 0);
__MAKE_SYNC_SHUFFLES(__shfl_sync_down, __nvvm_shfl_sync_down_i32,
__nvvm_shfl_sync_down_f32, 0x1f);
__MAKE_SYNC_SHUFFLES(__shfl_sync_xor, __nvvm_shfl_sync_bfly_i32,
__nvvm_shfl_sync_bfly_f32, 0x1f);

#pragma pop_macro("__MAKE_SYNC_SHUFFLES")

#endif // CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) ||
// __CUDA_ARCH__ >= 300)
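To make the packed last operand concrete: with the default width of 32, ((warpSize - __width) << 8) | __Mask evaluates to 0x1f for the idx/down/xor shuffles and to 0 for the up shuffle; with a sub-warp width of 16 it becomes 0x101f and 0x1000, respectively. A minimal sketch of using the generated overloads in a full-warp reduction follows (the helper function and the all-lanes mask literal are illustrative, not part of this commit):

__device__ int warp_sum(int v) {
  // Each step adds in the value held by the lane `delta` positions higher;
  // __width defaults to warpSize, so the whole warp participates.
  for (int delta = 16; delta > 0; delta /= 2)
    v += __shfl_sync_down(0xffffffff, v, delta);
  return v;
}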

// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.

// Prevent the vanilla sm_32 intrinsics header from being included.
40 changes: 40 additions & 0 deletions clang/test/CodeGen/builtins-nvptx-ptx60.cu
@@ -0,0 +1,40 @@
// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_60 \
// RUN: -fcuda-is-device -target-feature +ptx60 \
// RUN: -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK %s
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \
// RUN: -fcuda-is-device -S -o /dev/null -x cuda -verify %s

#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))

// CHECK-LABEL: nvvm_shfl_sync
__device__ void nvvm_shfl_sync(unsigned mask, int i, float f, int a, int b) {
// CHECK: call i32 @llvm.nvvm.shfl.sync.down.i32(i32 {{%[0-9]+}}, i32
// expected-error@+1 {{'__nvvm_shfl_sync_down_i32' needs target feature ptx60}}
__nvvm_shfl_sync_down_i32(mask, i, a, b);
// CHECK: call float @llvm.nvvm.shfl.sync.down.f32(i32 {{%[0-9]+}}, float
// expected-error@+1 {{'__nvvm_shfl_sync_down_f32' needs target feature ptx60}}
__nvvm_shfl_sync_down_f32(mask, f, a, b);
// CHECK: call i32 @llvm.nvvm.shfl.sync.up.i32(i32 {{%[0-9]+}}, i32
// expected-error@+1 {{'__nvvm_shfl_sync_up_i32' needs target feature ptx60}}
__nvvm_shfl_sync_up_i32(mask, i, a, b);
// CHECK: call float @llvm.nvvm.shfl.sync.up.f32(i32 {{%[0-9]+}}, float
// expected-error@+1 {{'__nvvm_shfl_sync_up_f32' needs target feature ptx60}}
__nvvm_shfl_sync_up_f32(mask, f, a, b);
// CHECK: call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 {{%[0-9]+}}, i32
// expected-error@+1 {{'__nvvm_shfl_sync_bfly_i32' needs target feature ptx60}}
__nvvm_shfl_sync_bfly_i32(mask, i, a, b);
// CHECK: call float @llvm.nvvm.shfl.sync.bfly.f32(i32 {{%[0-9]+}}, float
// expected-error@+1 {{'__nvvm_shfl_sync_bfly_f32' needs target feature ptx60}}
__nvvm_shfl_sync_bfly_f32(mask, f, a, b);
// CHECK: call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 {{%[0-9]+}}, i32
// expected-error@+1 {{'__nvvm_shfl_sync_idx_i32' needs target feature ptx60}}
__nvvm_shfl_sync_idx_i32(mask, i, a, b);
// CHECK: call float @llvm.nvvm.shfl.sync.idx.f32(i32 {{%[0-9]+}}, float
// expected-error@+1 {{'__nvvm_shfl_sync_idx_f32' needs target feature ptx60}}
__nvvm_shfl_sync_idx_f32(mask, f, a, b);
// CHECK: ret void
}
21 changes: 21 additions & 0 deletions clang/test/CodeGen/builtins-nvptx.c
@@ -636,3 +636,24 @@ __device__ void nvvm_ldg(const void *p) {
typedef double double2 __attribute__((ext_vector_type(2)));
__nvvm_ldg_d2((const double2 *)p);
}

// CHECK-LABEL: nvvm_shfl
__device__ void nvvm_shfl(int i, float f, int a, int b) {
// CHECK: call i32 @llvm.nvvm.shfl.down.i32(i32
__nvvm_shfl_down_i32(i, a, b);
// CHECK: call float @llvm.nvvm.shfl.down.f32(float
__nvvm_shfl_down_f32(f, a, b);
// CHECK: call i32 @llvm.nvvm.shfl.up.i32(i32
__nvvm_shfl_up_i32(i, a, b);
// CHECK: call float @llvm.nvvm.shfl.up.f32(float
__nvvm_shfl_up_f32(f, a, b);
// CHECK: call i32 @llvm.nvvm.shfl.bfly.i32(i32
__nvvm_shfl_bfly_i32(i, a, b);
// CHECK: call float @llvm.nvvm.shfl.bfly.f32(float
__nvvm_shfl_bfly_f32(f, a, b);
// CHECK: call i32 @llvm.nvvm.shfl.idx.i32(i32
__nvvm_shfl_idx_i32(i, a, b);
// CHECK: call float @llvm.nvvm.shfl.idx.f32(float
__nvvm_shfl_idx_f32(f, a, b);
// CHECK: ret void
}
44 changes: 44 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -3736,4 +3736,48 @@ def int_nvvm_shfl_idx_f32 :
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.idx.f32">,
GCCBuiltin<"__nvvm_shfl_idx_f32">;

// Synchronizing shfl variants available in CUDA-9.
// On sm_70 these don't have to be convergent, so we may eventually want to
// implement a non-convergent variant of these intrinsics.

// shfl.sync.down.b32 dest, threadmask, val, offset, mask_and_clamp
def int_nvvm_shfl_sync_down_i32 :
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.down.i32">,
GCCBuiltin<"__nvvm_shfl_sync_down_i32">;
def int_nvvm_shfl_sync_down_f32 :
Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.down.f32">,
GCCBuiltin<"__nvvm_shfl_sync_down_f32">;

// shfl.sync.up.b32 dest, threadmask, val, offset, mask_and_clamp
def int_nvvm_shfl_sync_up_i32 :
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.up.i32">,
GCCBuiltin<"__nvvm_shfl_sync_up_i32">;
def int_nvvm_shfl_sync_up_f32 :
Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.up.f32">,
GCCBuiltin<"__nvvm_shfl_sync_up_f32">;

// shfl.sync.bfly.b32 dest, threadmask, val, offset, mask_and_clamp
def int_nvvm_shfl_sync_bfly_i32 :
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.i32">,
GCCBuiltin<"__nvvm_shfl_sync_bfly_i32">;
def int_nvvm_shfl_sync_bfly_f32 :
Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.f32">,
GCCBuiltin<"__nvvm_shfl_sync_bfly_f32">;

// shfl.sync.idx.b32 dest, threadmask, val, lane, mask_and_clamp
def int_nvvm_shfl_sync_idx_i32 :
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.idx.i32">,
GCCBuiltin<"__nvvm_shfl_sync_idx_i32">;
def int_nvvm_shfl_sync_idx_f32 :
Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.idx.f32">,
GCCBuiltin<"__nvvm_shfl_sync_idx_f32">;
}
74 changes: 73 additions & 1 deletion llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -111,8 +111,80 @@ defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>;
defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>;
defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>;

} // isConvergent = 1
multiclass SHFL_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
// Threadmask and the last two parameters to shfl.sync can be regs or imms.
// ptxas is smart enough to inline constant registers, so strictly speaking we
// don't need to handle immediates here. But it's easy enough, and it makes
// our ptx more readable.
def rrr : NVPTXInst<
(outs regclass:$dst),
(ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
[(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
Int32Regs:$offset, Int32Regs:$mask))]>;

def rri : NVPTXInst<
(outs regclass:$dst),
(ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
[(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
Int32Regs:$offset, imm:$mask))]>;

def rir : NVPTXInst<
(outs regclass:$dst),
(ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
[(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
imm:$offset, Int32Regs:$mask))]>;

def rii : NVPTXInst<
(outs regclass:$dst),
(ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
[(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
imm:$offset, imm:$mask))]>;

def irr : NVPTXInst<
(outs regclass:$dst),
(ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
[(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
Int32Regs:$offset, Int32Regs:$mask))]>;

def iri : NVPTXInst<
(outs regclass:$dst),
(ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
[(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
Int32Regs:$offset, imm:$mask))]>;

def iir : NVPTXInst<
(outs regclass:$dst),
(ins i32imm:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
[(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
imm:$offset, Int32Regs:$mask))]>;

def iii : NVPTXInst<
(outs regclass:$dst),
(ins i32imm:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
[(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
imm:$offset, imm:$mask))]>;
}
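For instance, a call whose offset and mask_and_clamp operands are compile-time constants can select the rii pattern above and emit the immediates directly, producing something like shfl.sync.down.b32 %r1, %r2, 4, 31, %r3; rather than first materializing the constants in registers (the concrete register names here are illustrative).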

// On sm_70 these don't have to be convergent, so we may eventually want to
// implement a non-convergent variant of these intrinsics.
defm INT_SHFL_SYNC_DOWN_I32 : SHFL_SYNC<Int32Regs, "down", int_nvvm_shfl_sync_down_i32>;
defm INT_SHFL_SYNC_DOWN_F32 : SHFL_SYNC<Float32Regs, "down", int_nvvm_shfl_sync_down_f32>;
defm INT_SHFL_SYNC_UP_I32 : SHFL_SYNC<Int32Regs, "up", int_nvvm_shfl_sync_up_i32>;
defm INT_SHFL_SYNC_UP_F32 : SHFL_SYNC<Float32Regs, "up", int_nvvm_shfl_sync_up_f32>;
defm INT_SHFL_SYNC_BFLY_I32 : SHFL_SYNC<Int32Regs, "bfly", int_nvvm_shfl_sync_bfly_i32>;
defm INT_SHFL_SYNC_BFLY_F32 : SHFL_SYNC<Float32Regs, "bfly", int_nvvm_shfl_sync_bfly_f32>;
defm INT_SHFL_SYNC_IDX_I32 : SHFL_SYNC<Int32Regs, "idx", int_nvvm_shfl_sync_idx_i32>;
defm INT_SHFL_SYNC_IDX_F32 : SHFL_SYNC<Float32Regs, "idx", int_nvvm_shfl_sync_idx_f32>;

} // isConvergent = 1

//-----------------------------------
// Explicit Memory Fence Functions
