/
utils.h
56 lines (47 loc) · 1.83 KB
/
utils.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
//===---------------- Implementation of GPU utils ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
#define LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
#include "src/__support/macros/properties/architectures.h"
#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
#include "amdgpu/utils.h"
#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
#include "nvptx/utils.h"
#else
#include "generic/utils.h"
#endif
namespace LIBC_NAMESPACE {
namespace gpu {
/// Returns the zero-based position of the lowest set bit in \p lane_mask,
/// i.e. the id of the first lane marked active in the mask.
///
/// An empty mask has no set bit: the builtin then reports zero, the
/// subtraction produces -1, and the conversion to the unsigned return type
/// turns that into the all-ones value.
LIBC_INLINE uint64_t get_first_lane_id(uint64_t lane_mask) {
  // The builtin counts bit positions starting from one; zero means "no bit".
  const int one_based_position = __builtin_ffsll(lane_mask);
  return one_based_position - 1;
}
/// Predicate that holds for exactly one thread of the lane group: the one
/// whose id, as reported by gpu::get_lane_id(), matches the lowest set bit of
/// \p lane_mask.
LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
  const auto current_lane = gpu::get_lane_id();
  const uint64_t leader_lane = get_first_lane_id(lane_mask);
  return current_lane == leader_lane;
}
/// Computes the sum of \p x across the lanes of the warp or wavefront and
/// returns the value produced by gpu::broadcast_value() for that sum.
///
/// The stride starts at half of gpu::get_lane_size() and is halved after every
/// round. In each round the value obtained from gpu::shuffle() for the index
/// `lane id + stride` is added to the running total.
LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
  uint32_t stride = gpu::get_lane_size() / 2;
  while (stride > 0) {
    // NOTE(review): presumably shuffle() yields the `x` held by lane
    // `partner`; confirm against the target-specific implementation.
    const uint32_t partner = gpu::get_lane_id() + stride;
    const auto received = gpu::shuffle(lane_mask, partner, x);
    x += received;
    stride /= 2;
  }
  return gpu::broadcast_value(lane_mask, x);
}
/// Computes the accumulated (running) sum of \p x over the threads of the warp
/// or wavefront.
///
/// The distance doubles every round, starting at one and stopping once it
/// reaches gpu::get_lane_size(). Each round adds the value obtained from
/// gpu::shuffle() for the index `lane id - distance`, but only on lanes whose
/// id is at least `distance`; the other lanes add zero.
LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
  const auto lane = gpu::get_lane_id();
  for (uint32_t distance = 1; distance < gpu::get_lane_size(); distance <<= 1) {
    // Wraps around when lane < distance; that result is masked out below.
    const uint32_t source = lane - distance;
    // All ones when a source lane exists, all zeros otherwise.
    const uint32_t keep = (lane >= distance) ? ~uint32_t(0) : uint32_t(0);
    // The shuffle is issued on every lane regardless of `keep`; only its
    // contribution to the sum is masked.
    x += keep & gpu::shuffle(lane_mask, source, x);
  }
  return x;
}
} // namespace gpu
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H