//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER
#ifndef KOKKOS_CUDAEXEC_HPP
#define KOKKOS_CUDAEXEC_HPP
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <mutex>
#include <cstdint>
#include <cmath>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Instance.hpp>
#include <impl/Kokkos_GraphImpl_fwd.hpp>
#include <Cuda/Kokkos_Cuda_GraphNodeKernel.hpp>
#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/** \brief Access to constant memory on the device */
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
__device__ __constant__ extern unsigned long
kokkos_impl_cuda_constant_memory_buffer[];
#else
__device__ __constant__ unsigned long kokkos_impl_cuda_constant_memory_buffer
[Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long)];
#endif
template <typename T>
inline __device__ T* kokkos_impl_cuda_shared_memory() {
extern __shared__ Kokkos::CudaSpace::size_type sh[];
return (T*)sh;
}
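// A minimal usage sketch (illustrative only; `example_kernel` is a
// hypothetical name, not part of this header): a kernel launched with a
// dynamic shared memory size of n * sizeof(double) bytes can take a typed
// view of that scratch as
//
//   __global__ void example_kernel() {
//     double* scratch = kokkos_impl_cuda_shared_memory<double>();
//     scratch[threadIdx.x] = 0.0;
//   }
//
// Note that every instantiation aliases the same extern __shared__ buffer.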
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() {
const DriverType& driver =
*((const DriverType*)kokkos_impl_cuda_constant_memory_buffer);
driver();
}
template <class DriverType, unsigned int maxTperB, unsigned int minBperSM>
__global__ __launch_bounds__(
maxTperB, minBperSM) static void cuda_parallel_launch_constant_memory() {
const DriverType& driver =
*((const DriverType*)kokkos_impl_cuda_constant_memory_buffer);
driver();
}
template <class DriverType>
__global__ static void cuda_parallel_launch_local_memory(
const DriverType driver) {
driver();
}
template <class DriverType, unsigned int maxTperB, unsigned int minBperSM>
__global__ __launch_bounds__(
maxTperB,
minBperSM) static void cuda_parallel_launch_local_memory(const DriverType
driver) {
driver();
}
template <class DriverType>
__global__ static void cuda_parallel_launch_global_memory(
const DriverType* driver) {
driver->operator()();
}
template <class DriverType, unsigned int maxTperB, unsigned int minBperSM>
__global__ __launch_bounds__(
maxTperB,
minBperSM) static void cuda_parallel_launch_global_memory(const DriverType*
driver) {
driver->operator()();
}
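// The wrappers above correspond to the three ways a functor reaches the
// device: by value as a kernel argument (local memory), through the
// __constant__ buffer declared at the top of this file (constant memory),
// or through a device allocation passed by pointer (global memory). The
// __launch_bounds__ overloads are selected when a non-default LaunchBounds
// is supplied.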
//==============================================================================
// <editor-fold desc="Some helper functions for launch code readability"> {{{1
inline bool is_empty_launch(dim3 const& grid, dim3 const& block) {
return (grid.x == 0) || ((block.x * block.y * block.z) == 0);
}
inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
int const maxShmemPerBlock = cuda_instance->m_deviceProp.sharedMemPerBlock;
if (maxShmemPerBlock < shmem) {
Kokkos::Impl::throw_runtime_exception(
"CudaParallelLaunch (or graph node creation) FAILED: shared memory "
"request is too large");
}
}
// These functions need to be templated on DriverType and LaunchBounds
// so that the function-local static holding the attributes is unique for
// each type combo; KernelFuncPtr does not necessarily contain that type
// information.
// FIXME_CUDA_MULTIPLE_DEVICES
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
const cudaFuncAttributes& get_cuda_kernel_func_attributes(
const KernelFuncPtr& func) {
// Only call cudaFuncGetAttributes once for each unique kernel
// by leveraging static variable initialization rules
auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
cudaFuncAttributes attr;
KOKKOS_IMPL_CUDA_SAFE_CALL(
(CudaInternal::singleton().cuda_func_get_attributes_wrapper(&attr,
func)));
return attr;
};
static cudaFuncAttributes func_attr = wrap_get_attributes();
return func_attr;
}
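// The memoization above relies on the C++11 guarantee that a function-local
// static is initialized exactly once, even under concurrent calls, so
// cudaFuncGetAttributes runs a single time per (DriverType, LaunchBounds)
// instantiation. The pattern in isolation (sketch, hypothetical names):
//
//   template <class UniqueTag>
//   const cudaFuncAttributes& attributes_for() {
//     static cudaFuncAttributes attr = query_attributes_once<UniqueTag>();
//     return attr;  // initializer runs on the first call only
//   }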
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
inline void configure_shmem_preference(const KernelFuncPtr& func,
const cudaDeviceProp& device_props,
const size_t block_size, int& shmem,
const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER
const auto& func_attr =
get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
// Compute limits for number of blocks due to registers/SM
const size_t regs_per_sm = device_props.regsPerMultiprocessor;
const size_t regs_per_thread = func_attr.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);
// Compute the maximum number of warps as a function of the number of
// registers
const size_t max_warps_per_sm_registers =
cuda_max_warps_per_sm_registers(device_props, func_attr);
// Correct the number of blocks to respect the maximum number of warps per
// SM, which is constrained to be a multiple of the warp allocation
// granularity defined in `cuda_warp_per_sm_allocation_granularity`.
while ((max_blocks_regs * block_size / device_props.warpSize) >
max_warps_per_sm_registers)
max_blocks_regs--;
// Compute how many threads per SM we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
const size_t num_threads_desired =
((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
// Get close to the desired occupancy: don't undershoot by much, but also
// don't allocate a whole extra block just because the desired thread count
// exceeds a block boundary by a few threads.
size_t num_blocks_desired =
(num_threads_desired + block_size * 0.8) / block_size;
num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
if (num_blocks_desired == 0) num_blocks_desired = 1;
// Calculate how much shared memory we need per block
size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
// The minimum shared memory allocation we can have in total per SM is 8kB.
// If we want to lower occupancy we have to make sure we request at least that
// much in aggregate over all blocks, so that shared memory actually becomes a
// limiting factor for occupancy
constexpr size_t min_shmem_size_per_sm = 8192;
if ((occupancy < 100) &&
(shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
// Need to set the caller's shmem variable so that the
// kernel launch uses the correct dynamic shared memory request
shmem = shmem_per_block - func_attr.sharedSizeBytes;
}
// Compute the carveout fraction we need based on occupancy
// Use multiples of 8kB
const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
size_t carveout = shmem_per_block == 0
? 0
: 100 *
(((num_blocks_desired * shmem_per_block +
min_shmem_size_per_sm - 1) /
min_shmem_size_per_sm) *
min_shmem_size_per_sm) /
max_shmem_per_sm;
if (carveout > 100) carveout = 100;
// Set the carveout, but only call it once per kernel or when it changes
// FIXME_CUDA_MULTIPLE_DEVICES
auto set_cache_config = [&] {
KOKKOS_IMPL_CUDA_SAFE_CALL(
(CudaInternal::singleton().cuda_func_set_attributes_wrapper(
func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout)));
return carveout;
};
// Store the value in a static variable so we only reset if needed
static size_t cache_config_preference_cached = set_cache_config();
if (cache_config_preference_cached != carveout) {
cache_config_preference_cached = set_cache_config();
}
#else
// Use the parameters so we don't get a warning
(void)func;
(void)device_props;
(void)block_size;
(void)shmem;
(void)occupancy;
#endif
}
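// A worked example of the sizing logic above, with assumed device numbers
// (illustration only): for maxThreadsPerMultiProcessor = 2048,
// block_size = 256, occupancy = 50, shmem = 0, and no static shared memory:
//   num_threads_desired = ((2048 * 50 / 100 + 31) / 32) * 32 = 1024
//   num_blocks_desired  = (1024 + 256 * 0.8) / 256 = 4  (truncated from 4.8)
// Since 4 blocks * 0 bytes falls below the 8 kB per-SM minimum,
// shmem_per_block is bumped to 8192 / 4 = 2048 bytes so that shared memory,
// rather than some other resource, becomes the occupancy limiter.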
// </editor-fold> end Some helper functions for launch code readability }}}1
//==============================================================================
//==============================================================================
// <editor-fold desc="DeduceCudaLaunchMechanism"> {{{2
// Use local memory up to ConstantMemoryUseThreshold
// Use global memory above ConstantMemoryUsage
// In between use ConstantMemory
template <class DriverType>
struct DeduceCudaLaunchMechanism {
constexpr static const Kokkos::Experimental::WorkItemProperty::
HintLightWeight_t light_weight =
Kokkos::Experimental::WorkItemProperty::HintLightWeight;
constexpr static const Kokkos::Experimental::WorkItemProperty::
HintHeavyWeight_t heavy_weight =
Kokkos::Experimental::WorkItemProperty::HintHeavyWeight;
constexpr static const typename DriverType::Policy::work_item_property
property = typename DriverType::Policy::work_item_property();
static constexpr const Experimental::CudaLaunchMechanism
valid_launch_mechanism =
// BuildValidMask
(sizeof(DriverType) < CudaTraits::KernelArgumentLimit
? Experimental::CudaLaunchMechanism::LocalMemory
: Experimental::CudaLaunchMechanism::Default) |
(sizeof(DriverType) < CudaTraits::ConstantMemoryUsage
? Experimental::CudaLaunchMechanism::ConstantMemory
: Experimental::CudaLaunchMechanism::Default) |
Experimental::CudaLaunchMechanism::GlobalMemory;
static constexpr const Experimental::CudaLaunchMechanism
requested_launch_mechanism =
(((property & light_weight) == light_weight)
? Experimental::CudaLaunchMechanism::LocalMemory
: Experimental::CudaLaunchMechanism::ConstantMemory) |
Experimental::CudaLaunchMechanism::GlobalMemory;
static constexpr const Experimental::CudaLaunchMechanism
default_launch_mechanism =
// Pick the smallest mechanism the functor fits into
(sizeof(DriverType) < CudaTraits::ConstantMemoryUseThreshold)
? Experimental::CudaLaunchMechanism::LocalMemory
: ((sizeof(DriverType) < CudaTraits::ConstantMemoryUsage)
? Experimental::CudaLaunchMechanism::ConstantMemory
: Experimental::CudaLaunchMechanism::GlobalMemory);
// Legend: L = LocalMemory, C = ConstantMemory, G = GlobalMemory.
// Rows bound the functor size F against ConstantMemoryUseThreshold (UseT),
// KernelArgumentLimit (KAL), and ConstantMemoryUsage (CMU); under each hint
// the four columns are valid / requested / default / chosen mechanism.
//
//                   None                LightWeight         HeavyWeight
// F<UseT       LCG  LCG  L  L      LCG  LG  L  L      LCG  CG  L  C
// UseT<F<KAL   LCG  LCG  C  C      LCG  LG  C  L      LCG  CG  C  C
// KAL<F<CMU     CG  LCG  C  C       CG  LG  C  G       CG  CG  C  C
// CMU<F          G  LCG  G  G        G  LG  G  G        G  CG  G  G
static constexpr const Experimental::CudaLaunchMechanism launch_mechanism =
((property & light_weight) == light_weight)
? (sizeof(DriverType) < CudaTraits::KernelArgumentLimit
? Experimental::CudaLaunchMechanism::LocalMemory
: Experimental::CudaLaunchMechanism::GlobalMemory)
: (((property & heavy_weight) == heavy_weight)
? (sizeof(DriverType) < CudaTraits::ConstantMemoryUsage
? Experimental::CudaLaunchMechanism::ConstantMemory
: Experimental::CudaLaunchMechanism::GlobalMemory)
: (default_launch_mechanism));
};
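// For reference, the hints consulted above are attached to an execution
// policy via Kokkos::Experimental::require; a sketch (assuming a functor
// `f` and a range size `N`):
//
//   auto light = Kokkos::Experimental::require(
//       Kokkos::RangePolicy<Kokkos::Cuda>(0, N),
//       Kokkos::Experimental::WorkItemProperty::HintLightWeight);
//   Kokkos::parallel_for("Label", light, f);  // favors a LocalMemory launch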
// </editor-fold> end DeduceCudaLaunchMechanism }}}2
//==============================================================================
//==============================================================================
// <editor-fold desc="CudaParallelLaunchKernelInvoker"> {{{1
// Base classes that summarize the differences between the different launch
// mechanisms
template <class DriverType, class LaunchBounds,
Experimental::CudaLaunchMechanism LaunchMechanism>
struct CudaParallelLaunchKernelFunc;
template <class DriverType, class LaunchBounds,
Experimental::CudaLaunchMechanism LaunchMechanism>
struct CudaParallelLaunchKernelInvoker;
//------------------------------------------------------------------------------
// <editor-fold desc="Local memory"> {{{2
template <class DriverType, unsigned int MaxThreadsPerBlock,
unsigned int MinBlocksPerSM>
struct CudaParallelLaunchKernelFunc<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
Experimental::CudaLaunchMechanism::LocalMemory> {
static std::decay_t<decltype(cuda_parallel_launch_local_memory<
DriverType, MaxThreadsPerBlock, MinBlocksPerSM>)>
get_kernel_func() {
return cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
MinBlocksPerSM>;
}
};
template <class DriverType>
struct CudaParallelLaunchKernelFunc<
DriverType, Kokkos::LaunchBounds<0, 0>,
Experimental::CudaLaunchMechanism::LocalMemory> {
static std::decay_t<decltype(cuda_parallel_launch_local_memory<DriverType>)>
get_kernel_func() {
return cuda_parallel_launch_local_memory<DriverType>;
}
};
//------------------------------------------------------------------------------
template <class DriverType, class LaunchBounds>
struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::LocalMemory>
: CudaParallelLaunchKernelFunc<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::LocalMemory> {
using base_t = CudaParallelLaunchKernelFunc<
DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::LocalMemory>;
static_assert(sizeof(DriverType) < CudaTraits::KernelArgumentLimit,
"Kokkos Error: Requested CudaLaunchLocalMemory with a Functor "
"larger than 4096 bytes.");
static void invoke_kernel(DriverType const& driver, dim3 const& grid,
dim3 const& block, int shmem,
CudaInternal const* cuda_instance) {
(base_t::get_kernel_func())<<<grid, block, shmem,
cuda_instance->get_stream()>>>(driver);
}
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
auto& graph_node = Impl::get_cuda_graph_node_from_kernel(driver);
// Expect node not yet initialized
KOKKOS_EXPECTS(!bool(graph_node));
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
if constexpr (DriverType::Policy::
experimental_contains_desired_occupancy) {
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);
}
void const* args[] = {&driver};
cudaKernelNodeParams params = {};
params.blockDim = block;
params.gridDim = grid;
params.sharedMemBytes = shmem;
params.func = (void*)base_t::get_kernel_func();
params.kernelParams = (void**)args;
params.extra = nullptr;
KOKKOS_IMPL_CUDA_SAFE_CALL(
(cuda_instance->cuda_graph_add_kernel_node_wrapper(
&graph_node, graph, /* dependencies = */ nullptr,
/* numDependencies = */ 0, &params)));
} else {
// We still need an empty node for the dependency structure
KOKKOS_IMPL_CUDA_SAFE_CALL(
(cuda_instance->cuda_graph_add_empty_node_wrapper(
&graph_node, graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0)));
}
KOKKOS_ENSURES(bool(graph_node))
}
};
// </editor-fold> end local memory }}}2
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// <editor-fold desc="Global Memory"> {{{2
template <class DriverType, unsigned int MaxThreadsPerBlock,
unsigned int MinBlocksPerSM>
struct CudaParallelLaunchKernelFunc<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
Experimental::CudaLaunchMechanism::GlobalMemory> {
static void* get_kernel_func() {
return cuda_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
MinBlocksPerSM>;
}
};
template <class DriverType>
struct CudaParallelLaunchKernelFunc<
DriverType, Kokkos::LaunchBounds<0, 0>,
Experimental::CudaLaunchMechanism::GlobalMemory> {
static std::decay_t<decltype(cuda_parallel_launch_global_memory<DriverType>)>
get_kernel_func() {
return cuda_parallel_launch_global_memory<DriverType>;
}
};
//------------------------------------------------------------------------------
template <class DriverType, class LaunchBounds>
struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::GlobalMemory>
: CudaParallelLaunchKernelFunc<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory> {
using base_t = CudaParallelLaunchKernelFunc<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>;
static void invoke_kernel(DriverType const& driver, dim3 const& grid,
dim3 const& block, int shmem,
CudaInternal const* cuda_instance) {
DriverType* driver_ptr = reinterpret_cast<DriverType*>(
cuda_instance->scratch_functor(sizeof(DriverType)));
KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_memcpy_async_wrapper(
driver_ptr, &driver, sizeof(DriverType), cudaMemcpyDefault)));
(base_t::get_kernel_func())<<<grid, block, shmem,
cuda_instance->get_stream()>>>(driver_ptr);
}
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
auto& graph_node = Impl::get_cuda_graph_node_from_kernel(driver);
// Expect node not yet initialized
KOKKOS_EXPECTS(!bool(graph_node));
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
if constexpr (DriverType::Policy::
experimental_contains_desired_occupancy) {
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);
}
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
// Unlike in the non-graph case, we can get away with doing an async copy
// here because the `DriverType` instance is held in the GraphNodeImpl
// which is guaranteed to be alive until the graph instance itself is
// destroyed, where there should be a fence ensuring that the allocation
// associated with this kernel on the device side isn't deleted.
KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_memcpy_async_wrapper(
driver_ptr, &driver, sizeof(DriverType), cudaMemcpyDefault)));
void const* args[] = {&driver_ptr};
cudaKernelNodeParams params = {};
params.blockDim = block;
params.gridDim = grid;
params.sharedMemBytes = shmem;
params.func = (void*)base_t::get_kernel_func();
params.kernelParams = (void**)args;
params.extra = nullptr;
KOKKOS_IMPL_CUDA_SAFE_CALL(
(cuda_instance->cuda_graph_add_kernel_node_wrapper(
&graph_node, graph, /* dependencies = */ nullptr,
/* numDependencies = */ 0, &params)));
} else {
// We still need an empty node for the dependency structure
KOKKOS_IMPL_CUDA_SAFE_CALL(
(cuda_instance->cuda_graph_add_empty_node_wrapper(
&graph_node, graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0)));
}
KOKKOS_ENSURES(bool(graph_node))
}
};
// </editor-fold> end Global Memory }}}2
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// <editor-fold desc="Constant Memory"> {{{2
template <class DriverType, unsigned int MaxThreadsPerBlock,
unsigned int MinBlocksPerSM>
struct CudaParallelLaunchKernelFunc<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
Experimental::CudaLaunchMechanism::ConstantMemory> {
static std::decay_t<decltype(cuda_parallel_launch_constant_memory<
DriverType, MaxThreadsPerBlock, MinBlocksPerSM>)>
get_kernel_func() {
return cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
MinBlocksPerSM>;
}
};
template <class DriverType>
struct CudaParallelLaunchKernelFunc<
DriverType, Kokkos::LaunchBounds<0, 0>,
Experimental::CudaLaunchMechanism::ConstantMemory> {
static std::decay_t<
decltype(cuda_parallel_launch_constant_memory<DriverType>)>
get_kernel_func() {
return cuda_parallel_launch_constant_memory<DriverType>;
}
};
//------------------------------------------------------------------------------
template <class DriverType, class LaunchBounds>
struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::ConstantMemory>
: CudaParallelLaunchKernelFunc<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::ConstantMemory> {
using base_t = CudaParallelLaunchKernelFunc<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::ConstantMemory>;
static_assert(sizeof(DriverType) < CudaTraits::ConstantMemoryUsage,
"Kokkos Error: Requested CudaLaunchConstantMemory with a "
"Functor larger than 32kB.");
static void invoke_kernel(DriverType const& driver, dim3 const& grid,
dim3 const& block, int shmem,
CudaInternal const* cuda_instance) {
int cuda_device = cuda_instance->m_cudaDev;
// Wait until the previous kernel that uses the constant buffer is done
std::lock_guard<std::mutex> lock(
CudaInternal::constantMemMutexPerDevice[cuda_device]);
KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_synchronize_wrapper(
CudaInternal::constantMemReusablePerDevice[cuda_device])));
// Copy functor (synchronously) to staging buffer in pinned host memory
unsigned long* staging =
cuda_instance->constantMemHostStagingPerDevice[cuda_device];
memcpy(staging, &driver, sizeof(DriverType));
// Copy functor asynchronously from there to constant memory on the device
KOKKOS_IMPL_CUDA_SAFE_CALL(
(cuda_instance->cuda_memcpy_to_symbol_async_wrapper(
kokkos_impl_cuda_constant_memory_buffer, staging,
sizeof(DriverType), 0, cudaMemcpyHostToDevice)));
// Invoke the driver function on the device
(base_t::get_kernel_func())<<<grid, block, shmem,
cuda_instance->get_stream()>>>();
// Record an event that says when the constant buffer can be reused
KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_record_wrapper(
CudaInternal::constantMemReusablePerDevice[cuda_device])));
}
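// Summary of the protocol above: the per-device mutex serializes host
// threads competing for the constant buffer, the event synchronization
// waits until the previously launched constant-memory kernel has finished
// reading the buffer, and the event recorded after this launch marks when
// the buffer becomes reusable again.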
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance) {
// Just use global memory; coordinating through events to share constant
// memory with the non-graph interface is not really reasonable, since
// events don't work with Graphs directly. It would in any case require a
// much more complicated structure that finds previous nodes in the
// dependency structure of the graph and creates an implicit dependence
// based on the need for constant memory, which we would then have to
// prove does not create a dependency cycle; it is not clear there is an
// efficient way to do that, let alone in the structure we currently have.
using global_launch_impl_t = CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>;
global_launch_impl_t::create_parallel_launch_graph_node(
driver, grid, block, shmem, cuda_instance);
}
};
// </editor-fold> end Constant Memory }}}2
//------------------------------------------------------------------------------
// </editor-fold> end CudaParallelLaunchKernelInvoker }}}1
//==============================================================================
//==============================================================================
// <editor-fold desc="CudaParallelLaunchImpl"> {{{1
template <class DriverType, class LaunchBounds,
Experimental::CudaLaunchMechanism LaunchMechanism>
struct CudaParallelLaunchImpl;
template <class DriverType, unsigned int MaxThreadsPerBlock,
unsigned int MinBlocksPerSM,
Experimental::CudaLaunchMechanism LaunchMechanism>
struct CudaParallelLaunchImpl<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
LaunchMechanism>
: CudaParallelLaunchKernelInvoker<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
LaunchMechanism> {
using base_t = CudaParallelLaunchKernelInvoker<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
LaunchMechanism>;
inline static void launch_kernel(const DriverType& driver, const dim3& grid,
const dim3& block, int shmem,
const CudaInternal* cuda_instance) {
if (!Impl::is_empty_launch(grid, block)) {
// Prevent multiple threads from simultaneously setting the cache
// configuration preference and launching the same kernel
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
Impl::check_shmem_request(cuda_instance, shmem);
if constexpr (DriverType::Policy::
experimental_contains_desired_occupancy) {
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<
DriverType,
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);
}
desul::ensure_cuda_lock_arrays_on_device();
// Invoke the driver function on the device
base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
KOKKOS_IMPL_CUDA_SAFE_CALL(
(cuda_instance->cuda_get_last_error_wrapper()));
cuda_instance->fence(
"Kokkos::Impl::launch_kernel: Debug Only Check for Execution Error");
#endif
}
}
static cudaFuncAttributes get_cuda_func_attributes() {
return get_cuda_kernel_func_attributes<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func());
}
};
// </editor-fold> end CudaParallelLaunchImpl }}}1
//==============================================================================
//==============================================================================
// <editor-fold desc="CudaParallelLaunch"> {{{1
template <class DriverType, class LaunchBounds = Kokkos::LaunchBounds<>,
Experimental::CudaLaunchMechanism LaunchMechanism =
DeduceCudaLaunchMechanism<DriverType>::launch_mechanism,
bool DoGraph = DriverType::Policy::is_graph_kernel::value>
struct CudaParallelLaunch;
// General launch mechanism
template <class DriverType, class LaunchBounds,
Experimental::CudaLaunchMechanism LaunchMechanism>
struct CudaParallelLaunch<DriverType, LaunchBounds, LaunchMechanism,
/* DoGraph = */ false>
: CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism> {
using base_t =
CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism>;
template <class... Args>
CudaParallelLaunch(Args&&... args) {
base_t::launch_kernel((Args &&) args...);
}
};
// Launch mechanism for creating graph nodes
template <class DriverType, class LaunchBounds,
Experimental::CudaLaunchMechanism LaunchMechanism>
struct CudaParallelLaunch<DriverType, LaunchBounds, LaunchMechanism,
/* DoGraph = */ true>
: CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism> {
using base_t =
CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism>;
template <class... Args>
CudaParallelLaunch(Args&&... args) {
base_t::create_parallel_launch_graph_node((Args &&) args...);
}
};
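// Typical call site (sketch; `Closure` stands in for a ParallelFor /
// ParallelReduce driver type, and accessor spellings vary across the
// backend sources):
//
//   CudaParallelLaunch<Closure, LaunchBounds>(
//       closure, grid, block, shmem,
//       policy.space().impl_internal_space_instance());
//
// Whether this launches immediately or records a CUDA graph node is chosen
// at compile time from Policy::is_graph_kernel.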
// </editor-fold> end CudaParallelLaunch }}}1
//==============================================================================
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */