forked from kokkos/kokkos
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Kokkos_Cuda.hpp
288 lines (234 loc) · 8.85 KB
/
Kokkos_Cuda.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER
#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
#include <Kokkos_Macros.hpp>
static_assert(false,
"Including non-public Kokkos header files is not allowed.");
#endif
#ifndef KOKKOS_CUDA_HPP
#define KOKKOS_CUDA_HPP
#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_CUDA)
#include <Kokkos_Core_fwd.hpp>
#include <cstdint>
#include <iosfwd>
#include <string>
#include <vector>
#include <impl/Kokkos_AnalyzePolicy.hpp>
#include <Cuda/Kokkos_CudaSpace.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp> // CUDA_SAFE_CALL
#include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskScheduler.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_HostSharedPtr.hpp>
#include <impl/Kokkos_InitializationSettings.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
// Forward declarations only: they let this header name the types below
// without pulling in their definitions.
// CudaExec is not referenced anywhere else in the visible part of this header.
class CudaExec;
// CudaInternal is the type held by Cuda::m_space_instance and returned (as a
// raw pointer) by Cuda::impl_internal_space_instance().
class CudaInternal;
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace Experimental {
/// \brief Flags identifying a CUDA launch mechanism.
///
/// The underlying type is \c unsigned. Every enumerator other than
/// \c Default occupies its own bit, so values can be combined with
/// operator| and tested with operator&.
enum class CudaLaunchMechanism : unsigned {
  Default        = 0u,
  ConstantMemory = 1u << 0,
  GlobalMemory   = 1u << 1,
  LocalMemory    = 1u << 2
};

/// \brief Bitwise OR of the underlying values of \p lhs and \p rhs.
inline constexpr CudaLaunchMechanism operator|(CudaLaunchMechanism lhs,
                                               CudaLaunchMechanism rhs) {
  const unsigned lhs_bits = static_cast<unsigned>(lhs);
  const unsigned rhs_bits = static_cast<unsigned>(rhs);
  return static_cast<CudaLaunchMechanism>(lhs_bits | rhs_bits);
}

/// \brief Bitwise AND of the underlying values of \p lhs and \p rhs.
inline constexpr CudaLaunchMechanism operator&(CudaLaunchMechanism lhs,
                                               CudaLaunchMechanism rhs) {
  const unsigned lhs_bits = static_cast<unsigned>(lhs);
  const unsigned rhs_bits = static_cast<unsigned>(rhs);
  return static_cast<CudaLaunchMechanism>(lhs_bits & rhs_bits);
}

/// \brief Aggregate carrying a launch mechanism selected at compile time.
///
/// \tparam Mechanism value that \c launch_mechanism is initialized with.
template <CudaLaunchMechanism Mechanism>
struct CudaDispatchProperties {
  CudaLaunchMechanism launch_mechanism{Mechanism};
};
}  // namespace Experimental
/// \brief Strongly typed yes/no flag with \c bool as its underlying type.
///
/// Used as the \c manage_stream argument of the Cuda(cudaStream_t, ...)
/// constructor, where it defaults to \c no.
enum class ManageStream : bool {
  no  = false,
  yes = true
};
} // namespace Impl
/// \class Cuda
/// \brief Kokkos Execution Space that uses CUDA to run on GPUs.
///
/// An "execution space" represents a parallel execution model. It tells Kokkos
/// how to parallelize the execution of kernels in a parallel_for or
/// parallel_reduce. For example, the Threads execution space uses
/// C++11 threads on a CPU, the OpenMP execution space uses the OpenMP language
/// extensions, and the Serial execution space executes "parallel" kernels
/// sequentially. The Cuda execution space uses NVIDIA's CUDA programming
/// model to execute kernels in parallel on GPUs.
class Cuda {
public:
//! \name Type declarations that all Kokkos execution spaces must provide.
//@{
//! Tag this class as a Kokkos execution space.
using execution_space = Cuda;
#if defined(KOKKOS_ENABLE_CUDA_UVM)
//! This execution space's preferred memory space (CudaUVMSpace when
//! KOKKOS_ENABLE_CUDA_UVM is defined).
using memory_space = CudaUVMSpace;
#else
//! This execution space's preferred memory space.
using memory_space = CudaSpace;
#endif
//! This execution space's preferred device_type.
using device_type = Kokkos::Device<execution_space, memory_space>;
//! The size_type best suited for this execution space.
using size_type = memory_space::size_type;
//! This execution space's preferred array layout.
using array_layout = LayoutLeft;
//! Scratch memory space type associated with this execution space.
using scratch_memory_space = ScratchMemorySpace<Cuda>;
//@}
//--------------------------------------------------
//! \name Functions that all Kokkos devices must implement.
//@{
/// \brief True if and only if this method is being called in a
/// thread-parallel function.
///
/// The answer is fixed by the preprocessor, not computed at run time: the
/// body returns true (converted to the int 1) when __CUDA_ARCH__ is defined,
/// i.e. when the CUDA compiler is building device code, and false (0)
/// otherwise.
KOKKOS_INLINE_FUNCTION static int in_parallel() {
#if defined(__CUDA_ARCH__)
return true;
#else
return false;
#endif
}
/** \brief Set the device in a "sleep" state.
*
* This function sets the device in a "sleep" state in which it is
* not ready for work. This may consume less resources than if the
* device were in an "awake" state, but it may also take time to
* bring the device from a sleep state to be ready for work.
*
* \return True if the device is in the "sleep" state, else false if
* the device is actively working and could not enter the "sleep"
* state.
*/
static bool sleep();
/// \brief Wake the device from the 'sleep' state so it is ready for work.
///
/// \return True if the device is in the "ready" state, else "false"
/// if the device is actively working (which also means that it's
/// awake).
static bool wake();
/// \brief Wait until all dispatched functors complete.
///
/// The parallel_for or parallel_reduce dispatch of a functor may
/// return asynchronously, before the functor completes. This
/// method does not return until all dispatched functors on this
/// device have completed.
///
/// \param name label identifying this fence (no default is provided).
static void impl_static_fence(const std::string& name);
/// \brief Fence on this execution space instance.
///
/// \param name label identifying this fence; defaults to
/// "Kokkos::Cuda::fence(): Unnamed Instance Fence".
///
/// NOTE(review): presumably this waits only for work submitted to this
/// instance, in contrast to impl_static_fence() -- confirm against the
/// out-of-line definition.
void fence(const std::string& name =
"Kokkos::Cuda::fence(): Unnamed Instance Fence") const;
/** \brief Return the maximum amount of concurrency. */
// Declared as a static member when KOKKOS_ENABLE_DEPRECATED_CODE_4 is defined
// and as a const non-static member otherwise.
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
static int concurrency();
#else
int concurrency() const;
#endif
//! Print configuration information to the given output stream.
//! \param os stream to write to.
//! \param verbose defaults to false.
void print_configuration(std::ostream& os, bool verbose = false) const;
//@}
//--------------------------------------------------
//! \name Cuda space instances
//@{
//! Default constructor (defined out of line).
Cuda();
//! Construct an instance from a CUDA stream.
//! \param stream the CUDA stream to associate with this instance.
//! \param manage_stream defaults to Impl::ManageStream::no.
//! NOTE(review): presumably ManageStream::yes hands ownership of the stream
//! to the instance -- confirm against the out-of-line definition.
Cuda(cudaStream_t stream,
Impl::ManageStream manage_stream = Impl::ManageStream::no);
//! Deprecated overload that takes the flag as a plain bool; the overload
//! taking Impl::ManageStream replaces it.
KOKKOS_DEPRECATED Cuda(cudaStream_t stream, bool manage_stream);
//--------------------------------------------------------------------------
//! Free any resources being consumed by the device.
static void impl_finalize();
//! Has been initialized
static int impl_is_initialized();
//! Initialize, telling the CUDA run-time library which device to use.
static void impl_initialize(InitializationSettings const&);
/// \brief Cuda device architecture of the selected device.
///
/// This matches the __CUDA_ARCH__ specification.
static size_type device_arch();
//! Query device count.
static size_type detect_device_count();
/** \brief Detect the available devices and their architecture
* as defined by the __CUDA_ARCH__ specification.
*/
static std::vector<unsigned> detect_device_arch();
//! CUDA stream of this instance (defined out of line).
cudaStream_t cuda_stream() const;
//! CUDA device of this instance (defined out of line); this is the value
//! that Tools::Experimental::DeviceTypeTraits<Cuda>::device_id() reports.
int cuda_device() const;
//! CUDA device properties for this instance (defined out of line).
const cudaDeviceProp& cuda_device_prop() const;
//@}
//--------------------------------------------------------------------------
//! Name of this execution space (defined out of line).
static const char* name();
//! Raw pointer obtained from m_space_instance.get(); also the value compared
//! by operator== and operator!=.
inline Impl::CudaInternal* impl_internal_space_instance() const {
return m_space_instance.get();
}
//! Identifier of this instance (defined out of line).
uint32_t impl_instance_id() const noexcept;
private:
// Friends defined inside the class: the `private:` label does not restrict
// them, because access specifiers do not apply to friend declarations.
//! Two Cuda objects are equal exactly when they hold the same
//! Impl::CudaInternal pointer (identity, not a member-wise comparison).
friend bool operator==(Cuda const& lhs, Cuda const& rhs) {
return lhs.impl_internal_space_instance() ==
rhs.impl_internal_space_instance();
}
//! Negation of operator==.
friend bool operator!=(Cuda const& lhs, Cuda const& rhs) {
return !(lhs == rhs);
}
//! Handle to the Impl::CudaInternal object behind this instance.
Kokkos::Impl::HostSharedPtr<Impl::CudaInternal> m_space_instance;
};
namespace Tools {
namespace Experimental {
/// \brief Tooling traits specialized for the Cuda execution space.
template <>
struct DeviceTypeTraits<Cuda> {
  /// \brief An ID to differentiate (for example) Serial from OpenMP in Tooling
  constexpr static DeviceType id = DeviceType::Cuda;

  /// \brief Device id reported for \p exec: the result of exec.cuda_device().
  static int device_id(const Cuda& exec) {
    const int device = exec.cuda_device();
    return device;
  }
};
}  // namespace Experimental
}  // namespace Tools
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
/// Access traits for the (CudaSpace, Cuda::scratch_memory_space) pair:
/// accessible, but neither assignable nor deep-copyable.
template <>
struct MemorySpaceAccess<Kokkos::CudaSpace,
                         Kokkos::Cuda::scratch_memory_space> {
  enum : bool {
    assignable = false,
    accessible = true,
    deepcopy   = false
  };
};
#if defined(KOKKOS_ENABLE_CUDA_UVM)
// When UVM is forced everywhere (KOKKOS_ENABLE_CUDA_UVM), CudaUVMSpace
// must be assumed to be a stand-in for CudaSpace.
// This assumption fails if some host-side execution space
// defines CudaUVMSpace as its preferred memory space.
//
/// Access traits for the (CudaUVMSpace, Cuda::scratch_memory_space) pair:
/// accessible, but neither assignable nor deep-copyable.
template <>
struct MemorySpaceAccess<Kokkos::CudaUVMSpace,
                         Kokkos::Cuda::scratch_memory_space> {
  enum : bool {
    assignable = false,
    accessible = true,
    deepcopy   = false
  };
};
#endif
} // namespace Impl
} // namespace Kokkos
#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_HPP */