-
Notifications
You must be signed in to change notification settings - Fork 1.1k
/
codegen.py
378 lines (301 loc) · 11.9 KB
/
codegen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
from llvmlite import ir
from numba.core import config, serialize
from numba.core.codegen import Codegen, CodeLibrary
from .cudadrv import devices, driver, nvvm, runtime
from numba.cuda.cudadrv.libs import get_cudalib
import os
import subprocess
import tempfile
# LLVM target triple for NVVM-generated modules: 64-bit NVPTX for CUDA.
CUDA_TRIPLE = 'nvptx64-nvidia-cuda'
def run_nvdisasm(cubin, flags):
    """Disassemble a cubin image with the ``nvdisasm`` tool.

    cubin:
        The cubin to disassemble, as bytes.
    flags:
        Extra command-line flags to pass to nvdisasm.

    Returns nvdisasm's stdout decoded as UTF-8. Raises RuntimeError if the
    nvdisasm executable cannot be found.
    """
    # nvdisasm only accepts input from a file, so we need to write out to a
    # temp file and clean up afterwards.
    fd, fname = tempfile.mkstemp()
    try:
        # Write through the descriptor mkstemp handed us rather than opening
        # the file a second time by name; the with-block also closes the fd,
        # so it is not held open while nvdisasm runs.
        with os.fdopen(fd, 'wb') as f:
            f.write(cubin)

        try:
            cp = subprocess.run(['nvdisasm', *flags, fname], check=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        except FileNotFoundError as e:
            msg = ("nvdisasm has not been found. You may need "
                   "to install the CUDA toolkit and ensure that "
                   "it is available on your PATH.\n")
            raise RuntimeError(msg) from e

        return cp.stdout.decode('utf-8')
    finally:
        # If mkstemp itself raised we never reach this finally; otherwise the
        # temp file exists and must always be removed.
        os.unlink(fname)
def disassemble_cubin(cubin):
    """Return the SASS disassembly of *cubin*, annotated with line info."""
    # -gi asks nvdisasm to include line information in its output.
    return run_nvdisasm(cubin, ['-gi'])
def disassemble_cubin_for_cfg(cubin):
    """Return the SASS disassembly of *cubin* as a control flow graph."""
    # -cfg asks nvdisasm to emit the control flow graph of the code.
    return run_nvdisasm(cubin, ['-cfg'])
class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
    """
    The CUDACodeLibrary generates PTX, SASS, cubins for multiple different
    compute capabilities. It also loads cubins to multiple devices (via
    get_cufunc), which may be of different compute capabilities.
    """

    def __init__(self, codegen, name, entry_name=None, max_registers=None,
                 nvvm_options=None):
        """
        codegen:
            Codegen object.
        name:
            Name of the function in the source.
        entry_name:
            Name of the kernel function in the binary, if this is a global
            kernel and not a device function.
        max_registers:
            The maximum register usage to aim for when linking.
        nvvm_options:
            Dict of options to pass to NVVM.
        """
        super().__init__(codegen, name)

        # The llvmlite module for this library.
        self._module = None
        # CodeLibrary objects that will be "linked" into this library. The
        # modules within them are compiled from NVVM IR to PTX along with the
        # IR from this module - in that sense they are "linked" by NVVM at PTX
        # generation time, rather than at link time.
        self._linking_libraries = set()
        # Files to link with the generated PTX. These are linked using the
        # Driver API at link time.
        self._linking_files = set()
        # Should we link libcudadevrt?
        self.needs_cudadevrt = False

        # Cache the LLVM IR string
        self._llvm_strs = None
        # Maps CC -> PTX string
        self._ptx_cache = {}
        # Maps CC -> LTO-IR
        self._ltoir_cache = {}
        # Maps CC -> cubin
        self._cubin_cache = {}
        # Maps CC -> linker info output for cubin
        self._linkerinfo_cache = {}
        # Maps Device numeric ID -> cufunc
        self._cufunc_cache = {}

        self._max_registers = max_registers
        if nvvm_options is None:
            nvvm_options = {}
        self._nvvm_options = nvvm_options
        self._entry_name = entry_name

    @property
    def llvm_strs(self):
        # Stringified lazily, since modules may be added after construction.
        if self._llvm_strs is None:
            self._llvm_strs = [str(mod) for mod in self.modules]
        return self._llvm_strs

    def get_llvm_str(self):
        """Return the IR of all modules in this library as a single string."""
        return "\n\n".join(self.llvm_strs)

    def _ensure_cc(self, cc):
        """Return *cc* if given, otherwise the current device's compute
        capability."""
        if cc is not None:
            return cc

        device = devices.get_context().device
        return device.compute_capability

    def get_asm_str(self, cc=None):
        """Return PTX for the given compute capability (the current device's
        if *cc* is None), compiling with NVVM and caching the result."""
        cc = self._ensure_cc(cc)

        ptx = self._ptx_cache.get(cc, None)
        # Explicit None check for the cache hit, consistent with the other
        # caches - a falsy cached value must not trigger recompilation.
        if ptx is not None:
            return ptx

        arch = nvvm.get_arch_option(*cc)
        options = self._nvvm_options.copy()
        options['arch'] = arch

        irs = self.llvm_strs

        ptx = nvvm.compile_ir(irs, **options)

        # Sometimes the result from NVVM contains trailing whitespace and
        # nulls, which we strip so that the assembly dump looks a little
        # tidier.
        ptx = ptx.decode().strip('\x00').strip()

        if config.DUMP_ASSEMBLY:
            print(("ASSEMBLY %s" % self._name).center(80, '-'))
            print(ptx)
            print('=' * 80)

        self._ptx_cache[cc] = ptx

        return ptx

    def get_ltoir(self, cc=None):
        """Return LTO-IR for the given compute capability, compiling with
        NVVM and caching the result."""
        cc = self._ensure_cc(cc)

        ltoir = self._ltoir_cache.get(cc, None)
        if ltoir is not None:
            return ltoir

        arch = nvvm.get_arch_option(*cc)
        options = self._nvvm_options.copy()
        options['arch'] = arch
        # gen-lto is a flag-style option that takes no value.
        options['gen-lto'] = None

        irs = self.llvm_strs
        ltoir = nvvm.compile_ir(irs, **options)
        self._ltoir_cache[cc] = ltoir

        return ltoir

    def get_cubin(self, cc=None):
        """Return a linked cubin for the given compute capability, caching
        the result and the linker's info log."""
        cc = self._ensure_cc(cc)

        cubin = self._cubin_cache.get(cc, None)
        # Explicit None check, consistent with the other caches.
        if cubin is not None:
            return cubin

        linker = driver.Linker.new(max_registers=self._max_registers, cc=cc)

        if linker.lto:
            ltoir = self.get_ltoir(cc=cc)
            linker.add_ltoir(ltoir)
        else:
            ptx = self.get_asm_str(cc=cc)
            linker.add_ptx(ptx.encode())
        for path in self._linking_files:
            linker.add_file_guess_ext(path)
        if self.needs_cudadevrt:
            linker.add_file_guess_ext(get_cudalib('cudadevrt', static=True))

        cubin = linker.complete()
        self._cubin_cache[cc] = cubin
        self._linkerinfo_cache[cc] = linker.info_log

        return cubin

    def get_cufunc(self):
        """Load this library's cubin onto the current device and return the
        function for the kernel entry point, caching per device.

        Raises RuntimeError if this library has no entry name (i.e. it holds
        a device function rather than a global kernel).
        """
        if self._entry_name is None:
            msg = "Missing entry_name - are you trying to get the cufunc " \
                  "for a device function?"
            raise RuntimeError(msg)

        ctx = devices.get_context()
        device = ctx.device

        cufunc = self._cufunc_cache.get(device.id, None)
        # Explicit None check, consistent with the other caches.
        if cufunc is not None:
            return cufunc
        cubin = self.get_cubin(cc=device.compute_capability)
        module = ctx.create_module_image(cubin)

        # Load
        cufunc = module.get_function(self._entry_name)

        # Populate caches
        self._cufunc_cache[device.id] = cufunc

        return cufunc

    def get_linkerinfo(self, cc):
        """Return the linker info log recorded when the cubin for *cc* was
        built. Raises KeyError if no cubin has been built for *cc*."""
        try:
            return self._linkerinfo_cache[cc]
        except KeyError:
            # Suppress the uninformative implicit exception chain from the
            # cache lookup.
            raise KeyError(f'No linkerinfo for CC {cc}') from None

    def get_sass(self, cc=None):
        """Return the SASS disassembly (with line info) for *cc*."""
        return disassemble_cubin(self.get_cubin(cc=cc))

    def get_sass_cfg(self, cc=None):
        """Return the SASS disassembly as a control flow graph for *cc*."""
        return disassemble_cubin_for_cfg(self.get_cubin(cc=cc))

    def add_ir_module(self, mod):
        """Set the llvmlite IR module of this library; only one is allowed."""
        self._raise_if_finalized()
        if self._module is not None:
            raise RuntimeError('CUDACodeLibrary only supports one module')
        self._module = mod

    def add_linking_library(self, library):
        """Add a finalized CodeLibrary whose modules are compiled along with
        this library's IR."""
        library._ensure_finalized()

        # We don't want to allow linking more libraries in after finalization
        # because our linked libraries are modified by the finalization, and we
        # won't be able to finalize again after adding new ones
        self._raise_if_finalized()

        self._linking_libraries.add(library)

    def add_linking_file(self, filepath):
        """Add a file to be linked in with the Driver API at link time."""
        self._linking_files.add(filepath)

    def get_function(self, name):
        """Return the function named *name* from this library's module.

        Raises KeyError if no such function exists."""
        for fn in self._module.functions:
            if fn.name == name:
                return fn
        raise KeyError(f'Function {name} not found')

    @property
    def modules(self):
        # This library's own module followed by, recursively, the modules of
        # every linked library.
        return [self._module] + [mod for lib in self._linking_libraries
                                 for mod in lib.modules]

    @property
    def linking_libraries(self):
        # Libraries we link to may link to other libraries, so we recursively
        # traverse the linking libraries property to build up a list of all
        # linked libraries.
        libs = []
        for lib in self._linking_libraries:
            libs.extend(lib.linking_libraries)
            libs.append(lib)
        return libs

    def finalize(self):
        # Unlike the CPUCodeLibrary, we don't invoke the binding layer here -
        # we only adjust the linkage of functions. Global kernels (with
        # external linkage) have their linkage untouched. Device functions are
        # set linkonce_odr to prevent them appearing in the PTX.

        self._raise_if_finalized()

        # Note in-place modification of the linkage of functions in linked
        # libraries. This presently causes no issues as only device functions
        # are shared across code libraries, so they would always need their
        # linkage set to linkonce_odr. If in a future scenario some code
        # libraries require linkonce_odr linkage of functions in linked
        # modules, and another code library requires another linkage, each code
        # library will need to take its own private copy of its linked modules.
        #
        # See also discussion on PR #890:
        # https://github.com/numba/numba/pull/890
        for library in self._linking_libraries:
            for mod in library.modules:
                for fn in mod.functions:
                    if not fn.is_declaration:
                        fn.linkage = 'linkonce_odr'

        self._finalized = True

    def _reduce_states(self):
        """
        Reduce the instance for serialization. We retain the PTX and cubins,
        but loaded functions are discarded. They are recreated when needed
        after deserialization.
        """
        if self._linking_files:
            msg = 'Cannot pickle CUDACodeLibrary with linking files'
            raise RuntimeError(msg)
        if not self._finalized:
            raise RuntimeError('Cannot pickle unfinalized CUDACodeLibrary')
        return dict(
            codegen=None,
            name=self.name,
            entry_name=self._entry_name,
            llvm_strs=self.llvm_strs,
            ptx_cache=self._ptx_cache,
            cubin_cache=self._cubin_cache,
            linkerinfo_cache=self._linkerinfo_cache,
            max_registers=self._max_registers,
            nvvm_options=self._nvvm_options,
            needs_cudadevrt=self.needs_cudadevrt
        )

    @classmethod
    def _rebuild(cls, codegen, name, entry_name, llvm_strs, ptx_cache,
                 cubin_cache, linkerinfo_cache, max_registers, nvvm_options,
                 needs_cudadevrt):
        """
        Rebuild an instance.
        """
        instance = cls(codegen, name, entry_name=entry_name)

        instance._llvm_strs = llvm_strs
        instance._ptx_cache = ptx_cache
        instance._cubin_cache = cubin_cache
        instance._linkerinfo_cache = linkerinfo_cache

        instance._max_registers = max_registers
        instance._nvvm_options = nvvm_options
        instance.needs_cudadevrt = needs_cudadevrt

        # Cubins (and PTX) are carried over, so the rebuilt library is
        # considered finalized without re-running finalize().
        instance._finalized = True

        return instance
class JITCUDACodegen(Codegen):
    """
    This codegen implementation for CUDA only generates optimized LLVM IR.
    Generation of PTX code is done separately (see numba.cuda.compiler).
    """

    _library_class = CUDACodeLibrary

    def __init__(self, module_name):
        # The module name is unused; libraries create their own modules.
        pass

    def _create_empty_module(self, name):
        """Create a fresh llvmlite IR module configured for the NVVM
        target (triple, data layout, and NVVM IR version metadata)."""
        mod = ir.Module(name)
        mod.triple = CUDA_TRIPLE
        mod.data_layout = nvvm.NVVM().data_layout
        nvvm.add_ir_version(mod)
        return mod

    def _add_module(self, module):
        # Nothing to do - modules are not tracked at the codegen level.
        pass

    def magic_tuple(self):
        """
        Return a tuple unambiguously describing the codegen behaviour.
        """
        cc = devices.get_context().device.compute_capability
        return (runtime.runtime.get_version(), cc)