"""
CUDA driver bridge implementation
NOTE:
The new driver implementation uses a *_PendingDeallocs* that help prevents a
crashing the system (particularly OSX) when the CUDA context is corrupted at
resource deallocation. The old approach ties resource management directly
into the object destructor; thus, at corruption of the CUDA context,
subsequent deallocation could further corrupt the CUDA context and causes the
system to freeze in some cases.
"""
import sys
import os
import ctypes
import weakref
import functools
import copy
import warnings
import logging
import threading
from itertools import product
from abc import ABCMeta, abstractmethod
from ctypes import (c_int, byref, c_size_t, c_char, c_char_p, addressof,
c_void_p, c_float)
import contextlib
import importlib
import numpy as np
from collections import namedtuple, deque
from numba import mviewbuf
from numba.core import utils, errors, serialize, config
from .error import CudaSupportError, CudaDriverError
from .drvapi import API_PROTOTYPES
from .drvapi import cu_occupancy_b2d_size
from numba.cuda.cudadrv import enums, drvapi, _extras
from numba.core.utils import longint as long
from numba.cuda.envvars import get_numba_envvar
VERBOSE_JIT_LOG = int(get_numba_envvar('VERBOSE_CU_JIT_LOG', 1))
MIN_REQUIRED_CC = (2, 0)
SUPPORTS_IPC = sys.platform.startswith('linux')
def _make_logger():
logger = logging.getLogger(__name__)
# is logging configured?
if not logger.hasHandlers():
# read user config
lvl = str(config.CUDA_LOG_LEVEL).upper()
lvl = getattr(logging, lvl, None)
if not isinstance(lvl, int):
# default to critical level
lvl = logging.CRITICAL
logger.setLevel(lvl)
# did user specify a level?
if config.CUDA_LOG_LEVEL:
# create a simple handler that prints to stderr
handler = logging.StreamHandler(sys.stderr)
fmt = '== CUDA [%(relativeCreated)d] %(levelname)5s -- %(message)s'
handler.setFormatter(logging.Formatter(fmt=fmt))
logger.addHandler(handler)
else:
# otherwise, put a null handler
logger.addHandler(logging.NullHandler())
return logger
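# Driver logging is controlled by ``config.CUDA_LOG_LEVEL``, which Numba
# reads from the ``NUMBA_CUDA_LOG_LEVEL`` environment variable; an
# illustrative invocation:
#
#     NUMBA_CUDA_LOG_LEVEL=DEBUG python my_cuda_app.py
#
# When no level is set, a NullHandler is installed and driver calls log
# nothing.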
class DeadMemoryError(RuntimeError):
pass
class LinkerError(RuntimeError):
pass
class CudaAPIError(CudaDriverError):
def __init__(self, code, msg):
self.code = code
self.msg = msg
super(CudaAPIError, self).__init__(code, msg)
def __str__(self):
return "[%s] %s" % (self.code, self.msg)
def find_driver():
envpath = get_numba_envvar('CUDA_DRIVER')
if envpath == '0':
# Force fail
_raise_driver_not_found()
# Determine DLL type
if sys.platform == 'win32':
dlloader = ctypes.WinDLL
dldir = ['\\windows\\system32']
dlnames = ['nvcuda.dll']
elif sys.platform == 'darwin':
dlloader = ctypes.CDLL
dldir = ['/usr/local/cuda/lib']
dlnames = ['libcuda.dylib']
else:
# Assume to be *nix like
dlloader = ctypes.CDLL
dldir = ['/usr/lib', '/usr/lib64']
dlnames = ['libcuda.so', 'libcuda.so.1']
if envpath is not None:
try:
envpath = os.path.abspath(envpath)
except ValueError:
raise ValueError("NUMBA_CUDA_DRIVER %s is not a valid path" %
envpath)
if not os.path.isfile(envpath):
raise ValueError("NUMBA_CUDA_DRIVER %s is not a valid file "
"path. Note it must be a filepath of the .so/"
".dll/.dylib or the driver" % envpath)
candidates = [envpath]
else:
# First search for the name in the default library path.
# If that is not found, try the specific path.
candidates = dlnames + [os.path.join(x, y)
for x, y in product(dldir, dlnames)]
# Load the driver; Collect driver error information
path_not_exist = []
driver_load_error = []
for path in candidates:
try:
dll = dlloader(path)
except OSError as e:
# Problem opening the DLL
path_not_exist.append(not os.path.isfile(path))
driver_load_error.append(e)
else:
return dll
# Problem loading driver
if all(path_not_exist):
_raise_driver_not_found()
else:
errmsg = '\n'.join(str(e) for e in driver_load_error)
_raise_driver_error(errmsg)
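# The search above can be bypassed by setting NUMBA_CUDA_DRIVER to the
# driver shared library; the path below is only an example and varies by
# system:
#
#     NUMBA_CUDA_DRIVER=/usr/lib/x86_64-linux-gnu/libcuda.so.1 python app.py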
DRIVER_NOT_FOUND_MSG = """
CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
"""
DRIVER_LOAD_ERROR_MSG = """
Possible CUDA driver libraries are found but error occurred during load:
%s
"""
def _raise_driver_not_found():
raise CudaSupportError(DRIVER_NOT_FOUND_MSG)
def _raise_driver_error(e):
raise CudaSupportError(DRIVER_LOAD_ERROR_MSG % e)
def _build_reverse_error_map():
prefix = 'CUDA_ERROR'
map = utils.UniqueDict()
for name in dir(enums):
if name.startswith(prefix):
code = getattr(enums, name)
map[code] = name
return map
def _getpid():
return os.getpid()
ERROR_MAP = _build_reverse_error_map()
MISSING_FUNCTION_ERRMSG = """driver missing function: %s.
Requires CUDA 8.0 or above.
"""
class Driver(object):
"""
Driver API functions are lazily bound.
"""
_singleton = None
def __new__(cls):
obj = cls._singleton
if obj is not None:
return obj
else:
obj = object.__new__(cls)
cls._singleton = obj
return obj
def __init__(self):
self.devices = utils.UniqueDict()
self.is_initialized = False
self.initialization_error = None
self.pid = None
try:
if config.DISABLE_CUDA:
msg = ("CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 "
"in the environment, or because CUDA is unsupported on "
"32-bit systems.")
raise CudaSupportError(msg)
self.lib = find_driver()
except CudaSupportError as e:
self.is_initialized = True
self.initialization_error = e
def initialize(self):
# lazily initialize logger
global _logger
_logger = _make_logger()
self.is_initialized = True
try:
_logger.info('init')
self.cuInit(0)
except CudaAPIError as e:
self.initialization_error = e
raise CudaSupportError("Error at driver init: \n%s:" % e)
else:
self.pid = _getpid()
self._initialize_extras()
def _initialize_extras(self):
# set pointer to original cuIpcOpenMemHandle
set_proto = ctypes.CFUNCTYPE(None, c_void_p)
set_cuIpcOpenMemHandle = set_proto(_extras.set_cuIpcOpenMemHandle)
set_cuIpcOpenMemHandle(self._find_api('cuIpcOpenMemHandle'))
# bind caller to cuIpcOpenMemHandle that fixes the ABI
call_proto = ctypes.CFUNCTYPE(c_int,
ctypes.POINTER(drvapi.cu_device_ptr),
ctypes.POINTER(drvapi.cu_ipc_mem_handle),
ctypes.c_uint)
call_cuIpcOpenMemHandle = call_proto(_extras.call_cuIpcOpenMemHandle)
call_cuIpcOpenMemHandle.__name__ = 'call_cuIpcOpenMemHandle'
safe_call = self._wrap_api_call('call_cuIpcOpenMemHandle',
call_cuIpcOpenMemHandle)
# override cuIpcOpenMemHandle
self.cuIpcOpenMemHandle = safe_call
@property
def is_available(self):
if not self.is_initialized:
self.initialize()
return self.initialization_error is None
def __getattr__(self, fname):
# First request of a driver API function
try:
proto = API_PROTOTYPES[fname]
except KeyError:
raise AttributeError(fname)
restype = proto[0]
argtypes = proto[1:]
# Initialize driver
if not self.is_initialized:
self.initialize()
if self.initialization_error is not None:
raise CudaSupportError("Error at driver init: \n%s:" %
self.initialization_error)
# Find function in driver library
libfn = self._find_api(fname)
libfn.restype = restype
libfn.argtypes = argtypes
safe_call = self._wrap_api_call(fname, libfn)
setattr(self, fname, safe_call)
return safe_call
def _wrap_api_call(self, fname, libfn):
@functools.wraps(libfn)
def safe_cuda_api_call(*args):
_logger.debug('call driver api: %s', libfn.__name__)
retcode = libfn(*args)
self._check_error(fname, retcode)
return safe_cuda_api_call
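# A sketch of the lazy binding performed by ``__getattr__`` above: the
# first attribute access looks up the prototype, resolves the symbol in
# the library (preferring the ``_v2`` variant), wraps it with error
# checking and caches it on the instance, so that, for example,
#
#     driver.cuCtxSynchronize()
#
# pays the lookup cost only once; a nonzero return code raises
# CudaAPIError.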
def _find_api(self, fname):
# Try version 2
try:
return getattr(self.lib, fname + "_v2")
except AttributeError:
pass
# Try regular
try:
return getattr(self.lib, fname)
except AttributeError:
pass
# Not found.
# Defer the missing-function error until the function is actually used
def absent_function(*args, **kws):
raise CudaDriverError(MISSING_FUNCTION_ERRMSG % fname)
setattr(self, fname, absent_function)
return absent_function
def _check_error(self, fname, retcode):
if retcode != enums.CUDA_SUCCESS:
errname = ERROR_MAP.get(retcode, "UNKNOWN_CUDA_ERROR")
msg = "Call to %s results in %s" % (fname, errname)
_logger.error(msg)
if retcode == enums.CUDA_ERROR_NOT_INITIALIZED:
# Detect forking
if self.pid is not None and _getpid() != self.pid:
msg = 'pid %s forked from pid %s after CUDA driver init'
_logger.critical(msg, _getpid(), self.pid)
raise CudaDriverError("CUDA initialized before forking")
raise CudaAPIError(retcode, msg)
def get_device(self, devnum=0):
dev = self.devices.get(devnum)
if dev is None:
dev = Device(devnum)
self.devices[devnum] = dev
return weakref.proxy(dev)
def get_device_count(self):
count = c_int()
self.cuDeviceGetCount(byref(count))
return count.value
def list_devices(self):
"""Returns a list of active devices
"""
return list(self.devices.values())
def reset(self):
"""Reset all devices
"""
for dev in self.devices.values():
dev.reset()
def pop_active_context(self):
"""Pop the active CUDA context and return the handle.
If no CUDA context is active, return None.
"""
with self.get_active_context() as ac:
if ac.devnum is not None:
popped = drvapi.cu_context()
driver.cuCtxPopCurrent(byref(popped))
return popped
def get_active_context(self):
"""Returns an instance of ``_ActiveContext``.
"""
return _ActiveContext()
class _ActiveContext(object):
"""An contextmanager object to cache active context to reduce dependency
on querying the CUDA driver API.
Once entering the context, it is assumed that the active CUDA context is
not changed until the context is exited.
"""
_tls_cache = threading.local()
def __enter__(self):
is_top = False
# check TLS cache
if hasattr(self._tls_cache, 'ctx_devnum'):
hctx, devnum = self._tls_cache.ctx_devnum
# Not cached. Query the driver API.
else:
hctx = drvapi.cu_context(0)
driver.cuCtxGetCurrent(byref(hctx))
hctx = hctx if hctx.value else None
if hctx is None:
devnum = None
else:
hdevice = drvapi.cu_device()
driver.cuCtxGetDevice(byref(hdevice))
devnum = hdevice.value
self._tls_cache.ctx_devnum = (hctx, devnum)
is_top = True
self._is_top = is_top
self.context_handle = hctx
self.devnum = devnum
return self
def __exit__(self, exc_type, exc_val, exc_tb):
if self._is_top:
delattr(self._tls_cache, 'ctx_devnum')
def __bool__(self):
"""Returns True is there's a valid and active CUDA context.
"""
return self.context_handle is not None
__nonzero__ = __bool__
driver = Driver()
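# Illustrative use of the cached active-context query (assumes a CUDA
# context is already current on this thread):
#
#     with driver.get_active_context() as ac:
#         if ac:  # truthy only when a context is active
#             print(ac.devnum, ac.context_handle)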
def _build_reverse_device_attrs():
prefix = "CU_DEVICE_ATTRIBUTE_"
map = utils.UniqueDict()
for name in dir(enums):
if name.startswith(prefix):
map[name[len(prefix):]] = getattr(enums, name)
return map
DEVICE_ATTRIBUTES = _build_reverse_device_attrs()
class Device(object):
"""
The device object owns the CUDA contexts. It is owned by the driver
object. Users should not construct devices directly.
"""
@classmethod
def from_identity(cls, identity):
"""Create Device object from device identity created by
``Device.get_device_identity()``.
"""
for devid in range(driver.get_device_count()):
d = driver.get_device(devid)
if d.get_device_identity() == identity:
return d
else:
errmsg = (
"No device of {} is found. "
"Target device may not be visible in this process."
).format(identity)
raise RuntimeError(errmsg)
def __init__(self, devnum):
got_devnum = c_int()
driver.cuDeviceGet(byref(got_devnum), devnum)
assert devnum == got_devnum.value, "Driver returned another device"
self.id = got_devnum.value
self.attributes = {}
# Read compute capability
cc_major = c_int()
cc_minor = c_int()
driver.cuDeviceComputeCapability(byref(cc_major), byref(cc_minor),
self.id)
self.compute_capability = (cc_major.value, cc_minor.value)
# Read name
bufsz = 128
buf = (c_char * bufsz)()
driver.cuDeviceGetName(buf, bufsz, self.id)
self.name = buf.value
self.primary_context = None
def get_device_identity(self):
return {
'pci_domain_id': self.PCI_DOMAIN_ID,
'pci_bus_id': self.PCI_BUS_ID,
'pci_device_id': self.PCI_DEVICE_ID,
}
@property
def COMPUTE_CAPABILITY(self):
"""
For backward compatibility
"""
warnings.warn("Deprecated attribute 'COMPUTE_CAPABILITY'; use lower "
"case version", DeprecationWarning)
return self.compute_capability
def __repr__(self):
return "<CUDA device %d '%s'>" % (self.id, self.name)
def __getattr__(self, attr):
"""Read attributes lazily
"""
try:
code = DEVICE_ATTRIBUTES[attr]
except KeyError:
raise AttributeError(attr)
value = c_int()
driver.cuDeviceGetAttribute(byref(value), code, self.id)
setattr(self, attr, value.value)
return value.value
def __hash__(self):
return hash(self.id)
def __eq__(self, other):
if isinstance(other, Device):
return self.id == other.id
return False
def __ne__(self, other):
return not (self == other)
def get_primary_context(self):
"""
Returns the primary context for the device.
Note: it is not pushed onto the current CPU thread.
"""
if self.primary_context is not None:
return self.primary_context
met_requirement_for_device(self)
# create primary context
hctx = drvapi.cu_context()
driver.cuDevicePrimaryCtxRetain(byref(hctx), self.id)
ctx = Context(weakref.proxy(self), hctx)
self.primary_context = ctx
return ctx
def release_primary_context(self):
"""
Release reference to primary context if it has been retained.
"""
if self.primary_context:
driver.cuDevicePrimaryCtxRelease(self.id)
self.primary_context = None
def reset(self):
try:
if self.primary_context is not None:
self.primary_context.reset()
self.release_primary_context()
finally:
# reset at the driver level
driver.cuDevicePrimaryCtxReset(self.id)
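# Device attributes are resolved lazily by ``Device.__getattr__`` from the
# ``CU_DEVICE_ATTRIBUTE_*`` enums; a sketch of typical use (assumes device
# 0 exists):
#
#     dev = driver.get_device(0)
#     print(dev.name, dev.compute_capability)
#     print(dev.MAX_THREADS_PER_BLOCK)  # queried from the driver, then cached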
def met_requirement_for_device(device):
if device.compute_capability < MIN_REQUIRED_CC:
raise CudaSupportError("%s has compute capability < %s" %
(device, MIN_REQUIRED_CC))
class BaseCUDAMemoryManager(object, metaclass=ABCMeta):
"""Abstract base class for External Memory Management (EMM) Plugins."""
def __init__(self, *args, **kwargs):
if 'context' not in kwargs:
raise RuntimeError("Memory manager requires a context")
self.context = kwargs.pop('context')
@abstractmethod
def memalloc(self, size):
"""
Allocate on-device memory in the current context.
:param size: Size of allocation in bytes
:type size: int
:return: A memory pointer instance that owns the allocated memory
:rtype: :class:`MemoryPointer`
"""
@abstractmethod
def memhostalloc(self, size, mapped, portable, wc):
"""
Allocate pinned host memory.
:param size: Size of the allocation in bytes
:type size: int
:param mapped: Whether the allocated memory should be mapped into the CUDA
address space.
:type mapped: bool
:param portable: Whether the memory will be considered pinned by all
contexts, and not just the calling context.
:type portable: bool
:param wc: Whether to allocate the memory as write-combined.
:type wc: bool
:return: A memory pointer instance that owns the allocated memory. The
return type depends on whether the region was mapped into
device memory.
:rtype: :class:`MappedMemory` or :class:`PinnedMemory`
"""
@abstractmethod
def mempin(self, owner, pointer, size, mapped):
"""
Pin a region of host memory that is already allocated.
:param owner: The object that owns the memory.
:param pointer: The pointer to the beginning of the region to pin.
:type pointer: int
:param size: The size of the region in bytes.
:type size: int
:param mapped: Whether the region should also be mapped into device memory.
:type mapped: bool
:return: A memory pointer instance that refers to the allocated
memory.
:rtype: :class:`MappedMemory` or :class:`PinnedMemory`
"""
@abstractmethod
def initialize(self):
"""
Perform any initialization required for the EMM plugin instance to be
ready to use.
:return: None
"""
@abstractmethod
def get_ipc_handle(self, memory):
"""
Return an IPC handle from a GPU allocation.
:param memory: Memory for which the IPC handle should be created.
:type memory: :class:`MemoryPointer`
:return: IPC handle for the allocation
:rtype: :class:`IpcHandle`
"""
@abstractmethod
def get_memory_info(self):
"""
Returns ``(free, total)`` memory in bytes in the context. May raise
:class:`NotImplementedError` if returning such information is not
practical (e.g. for a pool allocator).
:return: Memory info
:rtype: :class:`MemoryInfo`
"""
@abstractmethod
def reset(self):
"""
Clears up all memory allocated in this context.
:return: None
"""
@abstractmethod
def defer_cleanup(self):
"""
Returns a context manager that defers the cleanup of deallocated
resources whilst it is active.
:return: Context manager
"""
@property
@abstractmethod
def interface_version(self):
"""
Returns an integer specifying the version of the EMM Plugin interface
supported by the plugin implementation. Should always return 1 for
implementations of this version of the specification.
"""
class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
"""Base class for External Memory Management (EMM) Plugins that only
implement on-device allocation. A subclass need not implement the
``memhostalloc`` and ``mempin`` methods.
This class also implements ``reset`` and ``defer_cleanup`` (see
:class:`numba.cuda.BaseCUDAMemoryManager`) for its own internal state
management. If an EMM Plugin based on this class also implements these
methods, then its implementations of these must also call the method from
``super()`` to give ``HostOnlyCUDAMemoryManager`` an opportunity to do the
necessary work for the host allocations it is managing.
This class does not implement ``interface_version``, as it will always be
consistent with the version of Numba in which it is implemented. An EMM
Plugin subclassing this class should implement ``interface_version``
instead.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.allocations = utils.UniqueDict()
self.deallocations = _PendingDeallocs()
def _attempt_allocation(self, allocator):
"""
Attempt allocation by calling *allocator*. If an out-of-memory error
is raised, the pending deallocations are flushed and the allocation
is retried. If it fails in the second attempt, the error is reraised.
"""
try:
allocator()
except CudaAPIError as e:
# is out-of-memory?
if e.code == enums.CUDA_ERROR_OUT_OF_MEMORY:
# clear pending deallocations
self.deallocations.clear()
# try again
allocator()
else:
raise
def memhostalloc(self, size, mapped=False, portable=False,
wc=False):
"""Implements the allocation of pinned host memory.
It is recommended that this method is not overridden by EMM Plugin
implementations; plugins needing custom behaviour should instead derive
from :class:`BaseCUDAMemoryManager` directly.
"""
pointer = c_void_p()
flags = 0
if mapped:
flags |= enums.CU_MEMHOSTALLOC_DEVICEMAP
if portable:
flags |= enums.CU_MEMHOSTALLOC_PORTABLE
if wc:
flags |= enums.CU_MEMHOSTALLOC_WRITECOMBINED
def allocator():
driver.cuMemHostAlloc(byref(pointer), size, flags)
if mapped:
self._attempt_allocation(allocator)
else:
allocator()
finalizer = _hostalloc_finalizer(self, pointer, size, mapped)
ctx = weakref.proxy(self.context)
if mapped:
mem = MappedMemory(ctx, pointer, size, finalizer=finalizer)
self.allocations[mem.handle.value] = mem
return mem.own()
else:
return PinnedMemory(ctx, pointer, size, finalizer=finalizer)
def mempin(self, owner, pointer, size, mapped=False):
"""Implements the pinning of host memory.
It is recommended that this method is not overridden by EMM Plugin
implementations; plugins needing custom behaviour should instead derive
from :class:`BaseCUDAMemoryManager` directly.
"""
if isinstance(pointer, (int, long)):
pointer = c_void_p(pointer)
# Possible flags are "portable" (pinned across contexts) and
# "device-map" (map the host memory into the device address space,
# removing the need for explicit memory transfers).
flags = 0
if mapped:
flags |= enums.CU_MEMHOSTREGISTER_DEVICEMAP
def allocator():
driver.cuMemHostRegister(pointer, size, flags)
if mapped:
self._attempt_allocation(allocator)
else:
allocator()
finalizer = _pin_finalizer(self, pointer, mapped)
ctx = weakref.proxy(self.context)
if mapped:
mem = MappedMemory(ctx, pointer, size, owner=owner,
finalizer=finalizer)
self.allocations[mem.handle.value] = mem
return mem.own()
else:
return PinnedMemory(ctx, pointer, size, owner=owner,
finalizer=finalizer)
def reset(self):
"""Clears up all host memory (mapped and/or pinned) in the current
context.
EMM Plugins that override this method must call ``super().reset()`` to
ensure that host allocations are also cleaned up."""
self.allocations.clear()
self.deallocations.clear()
@contextlib.contextmanager
def defer_cleanup(self):
"""Returns a context manager that disables cleanup of mapped or pinned
host memory in the current context whilst it is active.
EMM Plugins that override this method must obtain the context manager
from this method before yielding to ensure that cleanup of host
allocations is also deferred."""
with self.deallocations.disable():
yield
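# A minimal EMM Plugin sketch built on the class above; the class name and
# allocation strategy are hypothetical, shown only to illustrate the
# ``super()`` requirement described in the class docstring:
#
#     class MyEMMPlugin(HostOnlyCUDAMemoryManager):
#         def memalloc(self, size):
#             # delegate device allocation to an external allocator here
#             raise NotImplementedError
#
#         def reset(self):
#             super().reset()  # let host allocations be cleaned up too
#
#         @property
#         def interface_version(self):
#             return 1
#
#         # (initialize, get_ipc_handle and get_memory_info must also be
#         # provided to satisfy the abstract base class)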
class NumbaCUDAMemoryManager(HostOnlyCUDAMemoryManager):
"""Internal on-device memory management for Numba. This is implemented using
the EMM Plugin interface, but is not part of the public API."""
def initialize(self):
# Set the memory capacity of *deallocations* as the memory manager
# becomes active for the first time
if self.deallocations.memory_capacity == _SizeNotSet:
self.deallocations.memory_capacity = self.get_memory_info().total
def memalloc(self, size):
ptr = drvapi.cu_device_ptr()
def allocator():
driver.cuMemAlloc(byref(ptr), size)
self._attempt_allocation(allocator)
finalizer = _alloc_finalizer(self, ptr, size)
ctx = weakref.proxy(self.context)
mem = AutoFreePointer(ctx, ptr, size, finalizer=finalizer)
self.allocations[ptr.value] = mem
return mem.own()
def get_memory_info(self):
free = c_size_t()
total = c_size_t()
driver.cuMemGetInfo(byref(free), byref(total))
return MemoryInfo(free=free.value, total=total.value)
def get_ipc_handle(self, memory):
base, end = device_extents(memory)
ipchandle = drvapi.cu_ipc_mem_handle()
driver.cuIpcGetMemHandle(byref(ipchandle), base)
source_info = self.context.device.get_device_identity()
offset = memory.handle.value - base
return IpcHandle(memory, ipchandle, memory.size, source_info,
offset=offset)
@property
def interface_version(self):
return _SUPPORTED_EMM_INTERFACE_VERSION
_SUPPORTED_EMM_INTERFACE_VERSION = 1
_memory_manager = None
def _ensure_memory_manager():
global _memory_manager
if _memory_manager:
return
if config.CUDA_MEMORY_MANAGER == 'default':
_memory_manager = NumbaCUDAMemoryManager
return
try:
mgr_module = importlib.import_module(config.CUDA_MEMORY_MANAGER)
set_memory_manager(mgr_module._numba_memory_manager)
except Exception:
raise RuntimeError("Failed to use memory manager from %s" %
config.CUDA_MEMORY_MANAGER)
def set_memory_manager(mm_plugin):
"""Configure Numba to use an External Memory Management (EMM) Plugin. If
the EMM Plugin version does not match one supported by this version of
Numba, a RuntimeError will be raised.
:param mm_plugin: The class implementing the EMM Plugin.
:type mm_plugin: BaseCUDAMemoryManager
:return: None
"""
global _memory_manager
dummy = mm_plugin(context=None)
iv = dummy.interface_version
if iv != _SUPPORTED_EMM_INTERFACE_VERSION:
err = "EMM Plugin interface has version %d - version %d required" \
% (iv, _SUPPORTED_EMM_INTERFACE_VERSION)
raise RuntimeError(err)
_memory_manager = mm_plugin
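# Hypothetical usage of set_memory_manager (``MyEMMPlugin`` is illustrative):
#
#     from my_emm_package import MyEMMPlugin
#     set_memory_manager(MyEMMPlugin)
#
# Alternatively, ``config.CUDA_MEMORY_MANAGER`` can name a module exposing a
# ``_numba_memory_manager`` attribute, as handled by _ensure_memory_manager
# above.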
class _SizeNotSet(int):
"""
Dummy object for _PendingDeallocs when *size* is not set.
"""
def __new__(cls, *args, **kwargs):
return super().__new__(cls, 0)
def __str__(self):
return '?'
_SizeNotSet = _SizeNotSet()
class _PendingDeallocs(object):
"""
Pending deallocations of a context (or device since we are using the primary
context). The capacity defaults to being unset (_SizeNotSet) but can be
modified later once the driver is initialized and the total memory capacity
is known.
"""
def __init__(self, capacity=_SizeNotSet):
self._cons = deque()
self._disable_count = 0
self._size = 0
self.memory_capacity = capacity
@property
def _max_pending_bytes(self):
return int(self.memory_capacity * config.CUDA_DEALLOCS_RATIO)
def add_item(self, dtor, handle, size=_SizeNotSet):
"""
Add a pending deallocation.
The *dtor* arg is the destructor function that takes an argument,
*handle*. It is used as ``dtor(handle)``. The *size* arg is the
byte size of the resource added. It is an optional argument. Some
resources (e.g. CUModule) have an unknown memory footprint on the device.
"""
_logger.info('add pending dealloc: %s %s bytes', dtor.__name__, size)
self._cons.append((dtor, handle, size))
self._size += int(size)
if (len(self._cons) > config.CUDA_DEALLOCS_COUNT or
self._size > self._max_pending_bytes):
self.clear()
def clear(self):
"""
Flush any pending deallocations unless flushing is disabled, in which
case do nothing.
"""
if not self.is_disabled:
while self._cons:
[dtor, handle, size] = self._cons.popleft()
_logger.info('dealloc: %s %s bytes', dtor.__name__, size)
dtor(handle)
self._size = 0
@contextlib.contextmanager
def disable(self):
"""
Context manager to temporarily disable flushing of pending deallocations.
Calls can be nested.
"""
self._disable_count += 1
try:
yield
finally:
self._disable_count -= 1
assert self._disable_count >= 0
@property
def is_disabled(self):
return self._disable_count > 0
def __len__(self):
"""
Returns number of pending deallocations.
"""
return len(self._cons)
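# Sketch of the deferred-deallocation flow (``handle`` stands in for a real
# resource handle and assumes a live context):
#
#     pending = _PendingDeallocs(capacity=2 ** 30)
#     pending.add_item(driver.cuMemFree, handle, size=1024)
#     with pending.disable():
#         pending.clear()  # no-op: flushing is disabled
#     pending.clear()      # now calls driver.cuMemFree(handle)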
MemoryInfo = namedtuple("MemoryInfo", "free,total")
"""Free and total memory for a device.

.. py:attribute:: free

   Free device memory in bytes.

.. py:attribute:: total

   Total device memory in bytes.
"""
class Context(object):
"""
This object wraps a CUDA Context resource.
Contexts should not be constructed directly by user code.
"""
def __init__(self, device, handle):
self.device = device
self.handle = handle
self.allocations = utils.UniqueDict()