//===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL NextGen for AMDGPU machine
//
//===----------------------------------------------------------------------===//
#include <atomic>
#include <cassert>
#include <cstddef>
#include <deque>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <mutex>
#include <string>
#include <system_error>
#include <unistd.h>
#include <unordered_map>
#include "Debug.h"
#include "DeviceEnvironment.h"
#include "GlobalHandler.h"
#include "PluginInterface.h"
#include "Utilities.h"
#include "UtilitiesRTL.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace omp {
namespace target {
namespace plugin {
/// Forward declarations for all specialized data structures.
struct AMDGPUKernelTy;
struct AMDGPUDeviceTy;
struct AMDGPUPluginTy;
struct AMDGPUStreamTy;
struct AMDGPUEventTy;
struct AMDGPUStreamManagerTy;
struct AMDGPUEventManagerTy;
struct AMDGPUDeviceImageTy;
struct AMDGPUMemoryManagerTy;
struct AMDGPUMemoryPoolTy;
namespace utils {
/// Iterate elements using an HSA iterate function. Do not use this function
/// directly but the specialized ones below instead.
template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
return (*Unwrapped)(Elem);
};
return Func(L, static_cast<void *>(&Cb));
}
/// Iterate elements using an HSA iterate function passing a parameter. Do not
/// use this function directly but the specialized ones below instead.
template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy,
typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
return (*Unwrapped)(Elem);
};
return Func(FuncArg, L, static_cast<void *>(&Cb));
}
/// Iterate elements using an HSA iterate function passing a parameter. Do not
/// use this function directly but the specialized ones below instead.
template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy,
typename IterFuncArgTy, typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t {
CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
return (*Unwrapped)(Elem1, Elem2);
};
return Func(FuncArg, L, static_cast<void *>(&Cb));
}
/// Iterate agents.
template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) {
hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback);
return Plugin::check(Status, "Error in hsa_iterate_agents: %s");
}
/// Iterate ISAs of an agent.
template <typename CallbackTy>
Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) {
hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb);
return Plugin::check(Status, "Error in hsa_agent_iterate_isas: %s");
}
/// Iterate memory pools of an agent.
template <typename CallbackTy>
Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
hsa_status_t Status = iterate<hsa_amd_memory_pool_t>(
hsa_amd_agent_iterate_memory_pools, Agent, Cb);
return Plugin::check(Status,
"Error in hsa_amd_agent_iterate_memory_pools: %s");
}
} // namespace utils
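// Usage sketch for the iterate helpers above (illustrative only; the GPU
// filter and variable names are hypothetical, not part of this plugin):
//
//   llvm::SmallVector<hsa_agent_t> GPUAgents;
//   auto Err = utils::iterateAgents([&](hsa_agent_t Agent) {
//     hsa_device_type_t DeviceType;
//     hsa_status_t Status =
//         hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
//     if (Status == HSA_STATUS_SUCCESS && DeviceType == HSA_DEVICE_TYPE_GPU)
//       GPUAgents.push_back(Agent);
//     return Status;
//   });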
/// Utility class representing generic resource references to AMDGPU resources.
template <typename ResourceTy>
struct AMDGPUResourceRef : public GenericDeviceResourceRef {
/// Create an empty reference to an invalid resource.
AMDGPUResourceRef() : Resource(nullptr) {}
/// Create a reference to an existing resource.
AMDGPUResourceRef(ResourceTy *Resource) : Resource(Resource) {}
virtual ~AMDGPUResourceRef() {}
/// Create a new resource and save the reference. The reference must be empty
/// before calling this function.
Error create(GenericDeviceTy &Device) override;
/// Destroy the referenced resource and invalidate the reference. The
/// reference must be to a valid resource before calling this function.
Error destroy(GenericDeviceTy &Device) override {
if (!Resource)
return Plugin::error("Destroying an invalid resource");
if (auto Err = Resource->deinit())
return Err;
delete Resource;
Resource = nullptr;
return Plugin::success();
}
/// Get the underlying resource pointer.
operator ResourceTy *() const { return Resource; }
private:
/// The reference to the actual resource.
ResourceTy *Resource;
};
/// Class holding an HSA memory pool.
struct AMDGPUMemoryPoolTy {
/// Create a memory pool from an HSA memory pool.
AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool)
: MemoryPool(MemoryPool), GlobalFlags(0) {}
/// Initialize the memory pool retrieving its properties.
Error init() {
if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment))
return Err;
if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
return Err;
return Plugin::success();
}
/// Getter of the HSA memory pool.
hsa_amd_memory_pool_t get() const { return MemoryPool; }
/// Indicate if it belongs to the global segment.
bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); }
/// Indicate if it is fine-grained memory. Valid only for global.
bool isFineGrained() const {
assert(isGlobal() && "Not global memory");
return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED);
}
/// Indicate if it is coarse-grained memory. Valid only for global.
bool isCoarseGrained() const {
assert(isGlobal() && "Not global memory");
return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED);
}
/// Indicate if it supports storing kernel arguments. Valid only for global.
bool supportsKernelArgs() const {
assert(isGlobal() && "Not global memory");
return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT);
}
/// Allocate memory on the memory pool.
Error allocate(size_t Size, void **PtrStorage) {
hsa_status_t Status =
hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage);
return Plugin::check(Status, "Error in hsa_amd_memory_pool_allocate: %s");
}
/// Return memory to the memory pool.
Error deallocate(void *Ptr) {
hsa_status_t Status = hsa_amd_memory_pool_free(Ptr);
return Plugin::check(Status, "Error in hsa_amd_memory_pool_free: %s");
}
/// Allow the device to access a specific allocation.
Error enableAccess(void *Ptr, int64_t Size,
const llvm::SmallVector<hsa_agent_t> &Agents) const {
#ifdef OMPTARGET_DEBUG
for (hsa_agent_t Agent : Agents) {
hsa_amd_memory_pool_access_t Access;
if (auto Err =
getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access))
return Err;
// The agent is not allowed to access the memory pool in any case. Do not
// continue because otherwise it results in undefined behavior.
if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)
return Plugin::error("An agent is not allowed to access a memory pool");
}
#endif
// Access is allowed but disabled by default, so enable it explicitly now.
hsa_status_t Status =
hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr);
return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
}
private:
/// Get attribute from the memory pool.
template <typename Ty>
Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
hsa_status_t Status;
Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s");
}
/// Get attribute from the memory pool relating to an agent.
template <typename Ty>
Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind,
Ty &Value) const {
hsa_status_t Status;
Status =
hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value);
return Plugin::check(Status,
"Error in hsa_amd_agent_memory_pool_get_info: %s");
}
/// The HSA memory pool.
hsa_amd_memory_pool_t MemoryPool;
/// The segment to which the memory pool belongs.
hsa_amd_segment_t Segment;
/// The global flags of the memory pool. Only valid if the memory pool
/// the global segment.
uint32_t GlobalFlags;
};
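// Typical allocation flow against a pool (illustrative sketch; Pool, Size,
// and Agents are hypothetical, and error handling is abbreviated):
//
//   void *Ptr = nullptr;
//   if (auto Err = Pool.allocate(Size, &Ptr))
//     return Err;
//   // Access may be disabled by default; grant it to the involved agents.
//   if (auto Err = Pool.enableAccess(Ptr, Size, Agents))
//     return Err;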
/// Class that implements a memory manager that gets memory from a specific
/// memory pool.
struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
/// Create an empty memory manager.
AMDGPUMemoryManagerTy() : MemoryPool(nullptr), MemoryManager(nullptr) {}
/// Initialize the memory manager from a memory pool.
Error init(AMDGPUMemoryPoolTy &MemoryPool) {
const uint32_t Threshold = 1 << 30;
this->MemoryManager = new MemoryManagerTy(*this, Threshold);
this->MemoryPool = &MemoryPool;
return Plugin::success();
}
/// Deinitialize the memory manager and free its allocations.
Error deinit() {
assert(MemoryManager && "Invalid memory manager");
// Delete and invalidate the memory manager. At this point, the memory
// manager will deallocate all its allocations.
delete MemoryManager;
MemoryManager = nullptr;
return Plugin::success();
}
/// Reuse or allocate memory through the memory manager.
Error allocate(size_t Size, void **PtrStorage) {
assert(MemoryManager && "Invalid memory manager");
assert(PtrStorage && "Invalid pointer storage");
*PtrStorage = MemoryManager->allocate(Size, nullptr);
if (*PtrStorage == nullptr)
return Plugin::error("Failure to allocate from AMDGPU memory manager");
return Plugin::success();
}
/// Release an allocation to be reused.
Error deallocate(void *Ptr) {
assert(Ptr && "Invalid pointer");
if (MemoryManager->free(Ptr))
return Plugin::error("Failure to deallocate from AMDGPU memory manager");
return Plugin::success();
}
private:
/// Allocation callback that will be called once the memory manager has no
/// more previously allocated buffers to reuse.
void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
/// Deallocation callback that will be called by the memory manager.
int free(void *TgtPtr, TargetAllocTy Kind) override {
if (auto Err = MemoryPool->deallocate(TgtPtr)) {
consumeError(std::move(Err));
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
/// The memory pool used to allocate memory.
AMDGPUMemoryPoolTy *MemoryPool;
/// Reference to the actual memory manager.
MemoryManagerTy *MemoryManager;
};
/// Class implementing the AMDGPU device images' properties.
struct AMDGPUDeviceImageTy : public DeviceImageTy {
/// Create the AMDGPU image with the id and the target image pointer.
AMDGPUDeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
: DeviceImageTy(ImageId, TgtImage) {}
/// Prepare and load the executable corresponding to the image.
Error loadExecutable(const AMDGPUDeviceTy &Device);
/// Unload the executable.
Error unloadExecutable() {
hsa_status_t Status = hsa_executable_destroy(Executable);
if (auto Err = Plugin::check(Status, "Error in hsa_executable_destroy: %s"))
return Err;
Status = hsa_code_object_destroy(CodeObject);
return Plugin::check(Status, "Error in hsa_code_object_destroy: %s");
}
/// Get the executable.
hsa_executable_t getExecutable() const { return Executable; }
/// Find an HSA device symbol by its name on the executable.
Expected<hsa_executable_symbol_t>
findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
private:
/// The executable loaded on the agent.
hsa_executable_t Executable;
hsa_code_object_t CodeObject;
};
/// Class implementing the AMDGPU kernel functionalities which derives from the
/// generic kernel class.
struct AMDGPUKernelTy : public GenericKernelTy {
/// Create an AMDGPU kernel with a name and an execution mode.
AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
: GenericKernelTy(Name, ExecutionMode),
ImplicitArgsSize(sizeof(utils::AMDGPUImplicitArgsTy)) {}
/// Initialize the AMDGPU kernel.
Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
// Kernel symbols have a ".kd" suffix.
std::string KernelName(getName());
KernelName += ".kd";
// Find the symbol on the device executable.
auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName);
if (!SymbolOrErr)
return SymbolOrErr.takeError();
hsa_executable_symbol_t Symbol = *SymbolOrErr;
hsa_symbol_kind_t SymbolType;
hsa_status_t Status;
// Retrieve different properties of the kernel symbol.
std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
{HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
for (auto &Info : RequiredInfos) {
Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
if (auto Err = Plugin::check(
Status, "Error in hsa_executable_symbol_get_info: %s"))
return Err;
}
// Account for user requested dynamic shared memory.
// TODO: This should be read from a per-kernel state flag.
GroupSize += Device.getDynamicMemorySize();
// Make sure it is a kernel symbol.
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
return Plugin::error("Symbol %s is not a kernel function");
// TODO: Read the kernel descriptor for the max threads per block. May be
// read from the image.
return Plugin::success();
}
/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
uint64_t NumBlocks, uint32_t DynamicMemorySize,
int32_t NumKernelArgs, void *KernelArgs,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// The default number of blocks is common to the whole device.
uint64_t getDefaultNumBlocks(GenericDeviceTy &GenericDevice) const override {
return GenericDevice.getDefaultNumBlocks();
}
/// The default number of threads is common to the whole device.
uint32_t getDefaultNumThreads(GenericDeviceTy &GenericDevice) const override {
return GenericDevice.getDefaultNumThreads();
}
/// Get group and private segment kernel size.
uint32_t getGroupSize() const { return GroupSize; }
uint32_t getPrivateSize() const { return PrivateSize; }
/// Get the HSA kernel object representing the kernel function.
uint64_t getKernelObject() const { return KernelObject; }
private:
/// The kernel object to execute.
uint64_t KernelObject;
/// The args, group and private segments sizes required by a kernel instance.
uint32_t ArgsSize;
uint32_t GroupSize;
uint32_t PrivateSize;
/// The size of implicit kernel arguments.
const uint32_t ImplicitArgsSize;
};
/// Class representing an HSA signal. Signals are used to define dependencies
/// between asynchronous operations: kernel launches and memory transfers.
struct AMDGPUSignalTy {
/// Create an empty signal.
AMDGPUSignalTy() : Signal({0}), UseCount() {}
AMDGPUSignalTy(AMDGPUDeviceTy &Device) : Signal({0}), UseCount() {}
/// Initialize the signal with an initial value.
Error init(uint32_t InitialValue = 1) {
hsa_status_t Status =
hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &Signal);
return Plugin::check(Status, "Error in hsa_signal_create: %s");
}
/// Deinitialize the signal.
Error deinit() {
hsa_status_t Status = hsa_signal_destroy(Signal);
return Plugin::check(Status, "Error in hsa_signal_destroy: %s");
}
/// Wait until the signal gets a zero value.
Error wait() const {
// TODO: Is it better to use busy waiting or blocking the thread?
while (hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0,
UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0)
;
return Plugin::success();
}
/// Load the value on the signal.
hsa_signal_value_t load() const { return hsa_signal_load_scacquire(Signal); }
/// Signal decrementing by one.
void signal() {
assert(load() > 0 && "Invalid signal value");
hsa_signal_subtract_screlease(Signal, 1);
}
/// Reset the signal value before reusing the signal. Do not call this
/// function if the signal is currently being used by any watcher, such as a
/// plugin thread or the HSA runtime.
void reset() { hsa_signal_store_screlease(Signal, 1); }
/// Increase the number of concurrent uses.
void increaseUseCount() { UseCount.increase(); }
/// Decrease the number of concurrent uses and return whether it was the last.
bool decreaseUseCount() { return UseCount.decrease(); }
hsa_signal_t get() const { return Signal; }
private:
/// The underlying HSA signal.
hsa_signal_t Signal;
/// Reference counter for tracking the concurrent use count. This is mainly
/// used for knowing how many streams are using the signal.
RefCountTy<> UseCount;
};
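// Lifecycle sketch of a signal used as an operation's completion signal
// (illustrative, not actual plugin code):
//
//   AMDGPUSignalTy Signal;
//   if (auto Err = Signal.init())  // Initial value 1: operation pending.
//     return Err;
//   // ... submit an async operation with Signal as completion signal ...
//   if (auto Err = Signal.wait())  // Block until the value reaches zero.
//     return Err;
//   Signal.reset();                // Restore the value to 1 before reuse.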
/// Classes for holding AMDGPU signals and managing signals.
using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>;
using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;
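// The signal manager recycles signals instead of creating and destroying one
// per operation. A sketch of the acquire/release protocol, mirroring how the
// stream code below uses it (illustrative):
//
//   AMDGPUSignalTy *Signal = SignalManager.getResource();
//   Signal->reset();            // A recycled signal must start back at 1.
//   Signal->increaseUseCount();
//   // ... use as an operation's output signal ...
//   if (Signal->decreaseUseCount())
//     SignalManager.returnResource(Signal);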
/// Class holding an HSA queue to submit kernel and barrier packets.
struct AMDGPUQueueTy {
/// Create an empty queue.
AMDGPUQueueTy() : Queue(nullptr), Mutex() {}
/// Initialize a new queue belonging to a specific agent.
Error init(hsa_agent_t Agent, int32_t QueueSize) {
hsa_status_t Status =
hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
nullptr, UINT32_MAX, UINT32_MAX, &Queue);
return Plugin::check(Status, "Error in hsa_queue_create: %s");
}
/// Deinitialize the queue and destroy its resources.
Error deinit() {
hsa_status_t Status = hsa_queue_destroy(Queue);
return Plugin::check(Status, "Error in hsa_queue_destroy: %s");
}
/// Push a kernel launch to the queue. The kernel launch requires an output
/// signal and can define an optional input signal (nullptr if none).
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
AMDGPUSignalTy *OutputSignal,
AMDGPUSignalTy *InputSignal) {
assert(OutputSignal && "Invalid kernel output signal");
// Lock the queue during the packet publishing process. Notice this blocks
// the addition of other packets to the queue. The following piece of code
// should be lightweight; do not block the thread, allocate memory, etc.
std::lock_guard<std::mutex> Lock(Mutex);
// Avoid defining the input dependency if already satisfied.
if (InputSignal && !InputSignal->load())
InputSignal = nullptr;
// Add a barrier packet before the kernel packet in case there is a pending
// preceding operation. The barrier packet will delay the processing of
// the queue's subsequent packets until the barrier's input signal is
// satisfied. No output signal is needed because the dependency is already
// guaranteed by the queue barrier itself.
if (InputSignal)
if (auto Err = pushBarrierImpl(nullptr, InputSignal))
return Err;
// Now prepare the kernel packet.
uint64_t PacketId;
hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
assert(Packet && "Invalid packet");
// The header of the packet is written at the very last moment.
Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
Packet->workgroup_size_x = NumThreads;
Packet->workgroup_size_y = 1;
Packet->workgroup_size_z = 1;
Packet->reserved0 = 0;
Packet->grid_size_x = NumBlocks * NumThreads;
Packet->grid_size_y = 1;
Packet->grid_size_z = 1;
Packet->private_segment_size = Kernel.getPrivateSize();
Packet->group_segment_size = Kernel.getGroupSize();
Packet->kernel_object = Kernel.getKernelObject();
Packet->kernarg_address = KernelArgs;
Packet->reserved2 = 0;
Packet->completion_signal = OutputSignal->get();
// Publish the packet. Do not modify the packet after this point.
publishKernelPacket(PacketId, Packet);
return Plugin::success();
}
/// Push a barrier packet that will wait up to two input signals. All signals
/// are optional (nullptr if none).
Error pushBarrier(AMDGPUSignalTy *OutputSignal,
const AMDGPUSignalTy *InputSignal1,
const AMDGPUSignalTy *InputSignal2) {
// Lock the queue during the packet publishing process.
std::lock_guard<std::mutex> Lock(Mutex);
// Push the barrier with the lock acquired.
return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2);
}
private:
/// Push a barrier packet that will wait up to two input signals. Assumes
/// the queue lock is acquired.
Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal,
const AMDGPUSignalTy *InputSignal1,
const AMDGPUSignalTy *InputSignal2 = nullptr) {
// Add a queue barrier waiting on both the other stream's operation and the
// last operation on the current stream (if any).
uint64_t PacketId;
hsa_barrier_and_packet_t *Packet =
(hsa_barrier_and_packet_t *)acquirePacket(PacketId);
assert(Packet && "Invalid packet");
Packet->reserved0 = 0;
Packet->reserved1 = 0;
Packet->dep_signal[0] = {0};
Packet->dep_signal[1] = {0};
Packet->dep_signal[2] = {0};
Packet->dep_signal[3] = {0};
Packet->dep_signal[4] = {0};
Packet->reserved2 = 0;
Packet->completion_signal = {0};
// Set input and output dependencies if needed.
if (OutputSignal)
Packet->completion_signal = OutputSignal->get();
if (InputSignal1)
Packet->dep_signal[0] = InputSignal1->get();
if (InputSignal2)
Packet->dep_signal[1] = InputSignal2->get();
// Publish the packet. Do not modify the packet after this point.
publishBarrierPacket(PacketId, Packet);
return Plugin::success();
}
/// Acquire a packet from the queue. This call may block the thread if there
/// is no space in the underlying HSA queue. It may need to wait until the HSA
/// runtime processes some packets. Assumes the queue lock is acquired.
hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) {
// Increase the queue index with relaxed memory order. Notice this will need
// another subsequent atomic operation with acquire order.
PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
// Wait for the packet to be available. Notice the atomic operation uses
// the acquire memory order.
while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size)
;
// Return the packet reference.
const uint32_t Mask = Queue->size - 1; // The size is a power of 2.
return (hsa_kernel_dispatch_packet_t *)Queue->base_address +
(PacketId & Mask);
}
/// Publish the kernel packet so that the HSA runtime can start processing
/// the kernel launch. Do not modify the packet once this function is called.
/// Assumes the queue lock is acquired.
void publishKernelPacket(uint64_t PacketId,
hsa_kernel_dispatch_packet_t *Packet) {
uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
uint16_t Setup = Packet->setup;
uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
// Publish the packet. Do not modify the packet after this point.
__atomic_store_n(PacketPtr, Header | (Setup << 16), __ATOMIC_RELEASE);
// Signal the doorbell about the published packet.
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
}
/// Publish the barrier packet so that the HSA runtime can start processing
/// the barrier. Next packets in the queue will not be processed until all
/// barrier dependencies (signals) are satisfied. Assumes the queue is locked.
void publishBarrierPacket(uint64_t PacketId,
hsa_barrier_and_packet_t *Packet) {
uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
uint16_t Setup = 0;
uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
// Publish the packet. Do not modify the packet after this point.
__atomic_store_n(PacketPtr, Header | (Setup << 16), __ATOMIC_RELEASE);
// Signal the doorbell about the published packet.
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
}
/// Callback that will be called when an error is detected on the HSA queue.
static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
}
/// The HSA queue.
hsa_queue_t *Queue;
/// Mutex to protect the acquiring and publishing of packets. For the moment,
/// we need this mutex to prevent publishing packets that are not ready to be
/// published in a multi-thread scenario. Without a queue lock, a thread T1
/// could acquire packet P and thread T2 acquire packet P+1. Thread T2 could
/// publish its packet P+1 (signaling the queue's doorbell) before packet P
/// from T1 is ready to be processed. That scenario should be invalid. Thus,
/// we use the following mutex to make packet acquiring and publishing atomic.
/// TODO: There are other more advanced approaches to avoid this mutex using
/// atomic operations. We can further investigate it if this is a bottleneck.
std::mutex Mutex;
};
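// Packet sequence emitted by pushKernelLaunch when an input dependency is
// still pending (illustrative):
//
//   [barrier-AND: dep_signal[0] = InputSignal]
//   [kernel-dispatch: completion_signal = OutputSignal]
//
// The barrier stalls the queue until InputSignal reaches zero, so the kernel
// dispatch itself needs no explicit input dependency.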
/// Struct that implements a stream of asynchronous operations for AMDGPU
/// devices. This class relies on signals to implement streams and define the
/// dependencies between asynchronous operations.
struct AMDGPUStreamTy {
private:
/// Utility struct holding arguments for async H2H memory copies.
struct MemcpyArgsTy {
void *Dst;
const void *Src;
size_t Size;
};
/// Utility struct holding arguments for freeing buffers to memory managers.
struct ReleaseBufferArgsTy {
void *Buffer;
AMDGPUMemoryManagerTy *MemoryManager;
};
/// Utility struct holding arguments for releasing signals to signal managers.
struct ReleaseSignalArgsTy {
AMDGPUSignalTy *Signal;
AMDGPUSignalManagerTy *SignalManager;
};
/// The stream is composed of N stream slots. The struct below represents
/// the fields of each slot. Each slot has a signal and an optional action
/// function. When appending an HSA asynchronous operation to the stream, one
/// slot is consumed and used to store the operation's information. The
/// operation's output signal is set to the consumed slot's signal. If there
/// is a previous asynchronous operation on the previous slot, the HSA async
/// operation's input signal is set to the signal of the previous slot. This
/// way, we obtain a chain of dependent async operations. The action is a
/// function that will be executed eventually after the operation is
/// completed, e.g., for releasing a buffer.
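///
/// Illustrative chain of three consumed slots, where SN denotes the signal
/// of slot N:
///
///   op0 --S0--> op1 --S1--> op2 --S2--> (next slot)
///
/// Each operation waits on the previous slot's signal, if any, and signals
/// its own slot's signal upon completion.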
struct StreamSlotTy {
/// The output signal of the stream operation. May be used by the subsequent
/// operation as input signal.
AMDGPUSignalTy *Signal;
/// The action that must be performed after the operation's completion. Set
/// to nullptr when there is no action to perform.
Error (*ActionFunction)(void *);
/// Space for the action's arguments. A pointer to these arguments is passed
/// to the action function. Notice the space of arguments is limited.
union {
MemcpyArgsTy MemcpyArgs;
ReleaseBufferArgsTy ReleaseBufferArgs;
ReleaseSignalArgsTy ReleaseSignalArgs;
} ActionArgs;
/// Create an empty slot.
StreamSlotTy() : Signal(nullptr), ActionFunction(nullptr) {}
/// Schedule a host memory copy action on the slot.
Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
ActionFunction = memcpyAction;
ActionArgs.MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
return Plugin::success();
}
/// Schedule a release buffer action on the slot.
Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) {
ActionFunction = releaseBufferAction;
ActionArgs.ReleaseBufferArgs = ReleaseBufferArgsTy{Buffer, &Manager};
return Plugin::success();
}
/// Schedule a release signal action on the slot.
Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease,
AMDGPUSignalManagerTy *SignalManager) {
ActionFunction = releaseSignalAction;
ActionArgs.ReleaseSignalArgs =
ReleaseSignalArgsTy{SignalToRelease, SignalManager};
return Plugin::success();
}
// Perform the action if needed.
Error performAction() {
if (!ActionFunction)
return Plugin::success();
// Perform the action.
if (ActionFunction == memcpyAction) {
if (auto Err = memcpyAction(&ActionArgs))
return Err;
} else if (ActionFunction == releaseBufferAction) {
if (auto Err = releaseBufferAction(&ActionArgs))
return Err;
} else if (ActionFunction == releaseSignalAction) {
if (auto Err = releaseSignalAction(&ActionArgs))
return Err;
} else {
return Plugin::error("Unknown action function!");
}
// Invalidate the action.
ActionFunction = nullptr;
return Plugin::success();
}
};
/// The device agent where the stream was created.
hsa_agent_t Agent;
/// The queue that the stream uses to launch kernels.
AMDGPUQueueTy &Queue;
/// The manager of signals to reuse signals.
AMDGPUSignalManagerTy &SignalManager;
/// Array of stream slots. Use std::deque because it can dynamically grow
/// without invalidating the already inserted elements. For instance, a
/// std::vector may invalidate its elements by reallocating the internal
/// array if there is not enough space for new insertions.
std::deque<StreamSlotTy> Slots;
/// The next available slot on the queue. This is reset to zero each time the
/// stream is synchronized. It also indicates the current number of consumed
/// slots at a given time.
uint32_t NextSlot;
/// The synchronization id. This number is increased each time the stream is
/// synchronized. It is useful to detect if an AMDGPUEventTy points to an
/// operation that was already finalized in a previous stream synchronization.
uint32_t SyncCycle;
/// Mutex to protect stream's management.
mutable std::mutex Mutex;
/// Return the current number of asynchronous operations on the stream.
uint32_t size() const { return NextSlot; }
/// Return the last valid slot on the stream.
uint32_t last() const { return size() - 1; }
/// Consume one slot from the stream. Since the stream uses signals on demand
/// and releases them once the slot is no longer used, the function requires
/// an idle signal for the new consumed slot.
std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) {
// Double the stream size if needed. Since we use std::deque, this operation
// does not invalidate the already added slots.
if (Slots.size() == NextSlot)
Slots.resize(Slots.size() * 2);
// Update the next available slot and the stream size.
uint32_t Curr = NextSlot++;
// Retrieve the input signal, if any, of the current operation.
AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr;
// Set the output signal of the current slot.
Slots[Curr].Signal = OutputSignal;
return std::make_pair(Curr, InputSignal);
}
/// Complete all pending post actions and reset the stream after synchronizing
/// or positively querying the stream.
Error complete() {
for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) {
// Take the post action of the operation if any.
if (auto Err = Slots[Slot].performAction())
return Err;
// Release the slot's signal if possible. Otherwise, another user will.
if (Slots[Slot].Signal->decreaseUseCount())
SignalManager.returnResource(Slots[Slot].Signal);
Slots[Slot].Signal = nullptr;
}
// Reset the stream slots to zero.
NextSlot = 0;
// Increase the synchronization id since the stream completed a sync cycle.
SyncCycle += 1;
return Plugin::success();
}
/// Make the current stream wait on a specific operation of another stream.
/// The idea is to make the current stream wait on two signals: 1) the last
/// signal of the current stream, and 2) the last signal of the other stream.
/// Use a barrier packet with two input signals.
Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) {
/// The signal from the other stream's operation that we must wait on.
AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal;
// Prevent the release of the other stream's signal.
OtherSignal->increaseUseCount();
// Retrieve an available signal for the operation's output.
AMDGPUSignalTy *OutputSignal = SignalManager.getResource();
OutputSignal->reset();
OutputSignal->increaseUseCount();
// Consume stream slot and compute dependencies.
auto [Curr, InputSignal] = consume(OutputSignal);
// Setup the post action to release the signal.
if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager))
return Err;
// Push a barrier into the queue with both input signals.
return Queue.pushBarrier(OutputSignal, InputSignal, OtherSignal);
}
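// Illustrative effect of waitOnStreamOperation: waiting on slot K of
// OtherStream pushes into this stream's queue a single barrier packet
//
//   [barrier-AND: dep_signal[0] = this stream's last signal (if any),
//                 dep_signal[1] = OtherStream.Slots[K].Signal,
//                 completion_signal = OutputSignal]
//
// so subsequent operations on this stream chain off OutputSignal as usual.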
/// Callback for running a specific asynchronous operation. This callback is
/// used for hsa_amd_signal_async_handler. The argument is the operation that
/// should be executed. Notice we use the post action mechanism to codify the
/// asynchronous operation.
static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) {
StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args);
assert(Slot && "Invalid slot");
assert(Slot->Signal && "Invalid signal");
// This thread is outside the stream mutex. Make sure the thread sees the
// changes on the slot.
std::atomic_thread_fence(std::memory_order_acquire);
// Perform the operation.
if (auto Err = Slot->performAction())
FATAL_MESSAGE(1, "Error peforming post action: %s",
toString(std::move(Err)).data());
// Signal the output signal to notify the asynchronous operation finalized.
Slot->Signal->signal();
// Unregister callback.
return false;
}
// Callback for host-to-host memory copies.
static Error memcpyAction(void *Data) {
MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data);
assert(Args && "Invalid arguments");
assert(Args->Dst && "Invalid destination buffer");
assert(Args->Src && "Invalid source buffer");
std::memcpy(Args->Dst, Args->Src, Args->Size);
return Plugin::success();
}
// Callback for releasing a memory buffer to a memory manager.
static Error releaseBufferAction(void *Data) {
ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data);
assert(Args && "Invalid arguments");
assert(Args->MemoryManager && "Invalid memory manager");
assert(Args->Buffer && "Invalid buffer");
// Release the allocation to the memory manager.
return Args->MemoryManager->deallocate(Args->Buffer);
}
static Error releaseSignalAction(void *Data) {
ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data);
assert(Args && "Invalid arguments");
assert(Args->Signal && "Invalid signal");
assert(Args->SignalManager && "Invalid signal manager");
// Release the signal if needed.
if (Args->Signal->decreaseUseCount())
Args->SignalManager->returnResource(Args->Signal);
return Plugin::success();
}
public:
/// Create an empty stream associated with a specific device.
AMDGPUStreamTy(AMDGPUDeviceTy &Device);
/// Initialize the stream's signals.
Error init() { return Plugin::success(); }
/// Deinitialize the stream's signals.