Skip to content

Commit c29db84

Browse files
committed
[CUDA] Added a wrapper header for inclusion of stock CUDA headers.
Header files that come with CUDA are assuming split host/device compilation and are not usable by clang out of the box. With a bit of preprocessor magic it's possible to twist them into something clang can use. This wrapper always includes CUDA headers exactly the same way during host and device compilation passes and produces identical preprocessed content during host and device side compilation for sm_35 GPUs. Device compilation passes for older GPUs will see a smaller subset of device functions supported by particular GPU. The wrapper assumes specific contents of CUDA header files and works only with CUDA 7.0 and 7.5. Differential Revision: http://reviews.llvm.org/D13171 llvm-svn: 253388
1 parent 34f481a commit c29db84

File tree

2 files changed

+180
-0
lines changed

2 files changed

+180
-0
lines changed

clang/lib/Headers/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ set(files
1717
bmiintrin.h
1818
cpuid.h
1919
cuda_builtin_vars.h
20+
cuda_runtime.h
2021
emmintrin.h
2122
f16cintrin.h
2223
float.h

clang/lib/Headers/cuda_runtime.h

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
/*===---- cuda_runtime.h - CUDA runtime support ----------------------------===
2+
*
3+
* Permission is hereby granted, free of charge, to any person obtaining a copy
4+
* of this software and associated documentation files (the "Software"), to deal
5+
* in the Software without restriction, including without limitation the rights
6+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7+
* copies of the Software, and to permit persons to whom the Software is
8+
* furnished to do so, subject to the following conditions:
9+
*
10+
* The above copyright notice and this permission notice shall be included in
11+
* all copies or substantial portions of the Software.
12+
*
13+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19+
* THE SOFTWARE.
20+
*
21+
*===-----------------------------------------------------------------------===
22+
*/
23+
24+
#ifndef __CLANG_CUDA_RUNTIME_H__
25+
#define __CLANG_CUDA_RUNTIME_H__
26+
27+
#if defined(__CUDA__) && defined(__clang__)
28+
29+
// Include some standard headers to avoid CUDA headers including them
30+
// while some required macros (like __THROW) are in a weird state.
31+
#include <stdlib.h>
32+
33+
// Preserve common macros that will be changed below by us or by CUDA
34+
// headers.
35+
#pragma push_macro("__THROW")
36+
#pragma push_macro("__CUDA_ARCH__")
37+
38+
// WARNING: Preprocessor hacks below are based on specific of
39+
// implementation of CUDA-7.x headers and are expected to break with
40+
// any other version of CUDA headers.
41+
#include "cuda.h"
42+
#if !defined(CUDA_VERSION)
43+
#error "cuda.h did not define CUDA_VERSION"
44+
#elif CUDA_VERSION < 7000 || CUDA_VERSION > 7050
45+
#error "Unsupported CUDA version!"
46+
#endif
47+
48+
// Make largest subset of device functions available during host
49+
// compilation -- SM_35 for the time being.
50+
#ifndef __CUDA_ARCH__
51+
#define __CUDA_ARCH__ 350
52+
#endif
53+
54+
#include "cuda_builtin_vars.h"
55+
56+
// No need for device_launch_parameters.h as cuda_builtin_vars.h above
57+
// has taken care of builtin variables declared in the file.
58+
#define __DEVICE_LAUNCH_PARAMETERS_H__
59+
60+
// {math,device}_functions.h only have declarations of the
61+
// functions. We don't need them as we're going to pull in their
62+
// definitions from .hpp files.
63+
#define __DEVICE_FUNCTIONS_H__
64+
#define __MATH_FUNCTIONS_H__
65+
66+
#undef __CUDACC__
67+
#define __CUDABE__
68+
// Disables definitions of device-side runtime support stubs in
69+
// cuda_device_runtime_api.h
70+
#define __CUDADEVRT_INTERNAL__
71+
#include "host_config.h"
72+
#include "host_defines.h"
73+
#include "driver_types.h"
74+
#include "common_functions.h"
75+
#undef __CUDADEVRT_INTERNAL__
76+
77+
#undef __CUDABE__
78+
#define __CUDACC__
79+
#include_next "cuda_runtime.h"
80+
81+
#undef __CUDACC__
82+
#define __CUDABE__
83+
84+
// CUDA headers use __nvvm_memcpy and __nvvm_memset which clang does
85+
// not have at the moment. Emulate them with a builtin memcpy/memset.
86+
#define __nvvm_memcpy(s,d,n,a) __builtin_memcpy(s,d,n)
87+
#define __nvvm_memset(d,c,n,a) __builtin_memset(d,c,n)
88+
89+
#include "crt/host_runtime.h"
90+
#include "crt/device_runtime.h"
91+
// device_runtime.h defines __cxa_* macros that will conflict with
92+
// cxxabi.h.
93+
// FIXME: redefine these as __device__ functions.
94+
#undef __cxa_vec_ctor
95+
#undef __cxa_vec_cctor
96+
#undef __cxa_vec_dtor
97+
#undef __cxa_vec_new2
98+
#undef __cxa_vec_new3
99+
#undef __cxa_vec_delete2
100+
#undef __cxa_vec_delete
101+
#undef __cxa_vec_delete3
102+
#undef __cxa_pure_virtual
103+
104+
// We need decls for functions in CUDA's libdevice woth __device__
105+
// attribute only. Alas they come either as __host__ __device__ or
106+
// with no attributes at all. To work around that, define __CUDA_RTC__
107+
// which produces HD variant and undef __host__ which gives us desided
108+
// decls with __device__ attribute.
109+
#pragma push_macro("__host__")
110+
#define __host__
111+
#define __CUDACC_RTC__
112+
#include "device_functions_decls.h"
113+
#undef __CUDACC_RTC__
114+
115+
// Temporarily poison __host__ macro to ensure it's not used by any of
116+
// the headers we're about to include.
117+
#define __host__ UNEXPECTED_HOST_ATTRIBUTE
118+
119+
// device_functions.hpp and math_functions*.hpp use 'static
120+
// __forceinline__' (with no __device__) for definitions of device
121+
// functions. Temporarily redefine __forceinline__ to include
122+
// __device__.
123+
#pragma push_macro("__forceinline__")
124+
#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
125+
#include "device_functions.hpp"
126+
#include "math_functions.hpp"
127+
#include "math_functions_dbl_ptx3.hpp"
128+
#pragma pop_macro("__forceinline__")
129+
130+
// For some reason single-argument variant is not always declared by
131+
// CUDA headers. Alas, device_functions.hpp included below needs it.
132+
static inline __device__ void __brkpt(int c) { __brkpt(); }
133+
134+
// Now include *.hpp with definitions of various GPU functions. Alas,
135+
// a lot of thins get declared/defined with __host__ attribute which
136+
// we don't want and we have to define it out. We also have to include
137+
// {device,math}_functions.hpp again in order to extract the other
138+
// branch of #if/else inside.
139+
140+
#define __host__
141+
#undef __CUDABE__
142+
#define __CUDACC__
143+
#undef __DEVICE_FUNCTIONS_HPP__
144+
#include "device_functions.hpp"
145+
#include "device_atomic_functions.hpp"
146+
#include "sm_20_atomic_functions.hpp"
147+
#include "sm_32_atomic_functions.hpp"
148+
#include "sm_20_intrinsics.hpp"
149+
// sm_30_intrinsics.h has declarations that use default argument, so
150+
// we have to include it and it will in turn include .hpp
151+
#include "sm_30_intrinsics.h"
152+
#include "sm_32_intrinsics.hpp"
153+
#undef __MATH_FUNCTIONS_HPP__
154+
#include "math_functions.hpp"
155+
#pragma pop_macro("__host__")
156+
157+
#include "texture_indirect_functions.h"
158+
159+
// Restore state of __CUDA_ARCH__ and __THROW we had on entry.
160+
#pragma pop_macro("__CUDA_ARCH__")
161+
#pragma pop_macro("__THROW")
162+
163+
// Set up compiler macros expected to be seen during compilation.
164+
#undef __CUDABE__
165+
#define __CUDACC__
166+
#define __NVCC__
167+
168+
#if defined(__CUDA_ARCH__)
169+
// We need to emit IR declaration for non-existing __nvvm_reflect to
170+
// let backend know that it should be treated as const nothrow
171+
// function which is implicitly assumed by NVVMReflect pass.
172+
extern "C" __device__ __attribute__((const)) int __nvvm_reflect(const void *);
173+
static __device__ __attribute__((used)) int __nvvm_reflect_anchor() {
174+
return __nvvm_reflect("NONE");
175+
}
176+
#endif
177+
178+
#endif // __CUDA__
179+
#endif // __CLANG_CUDA_RUNTIME_H__

0 commit comments

Comments
 (0)