# broadcast.jl (forked from JuliaGPU/CUDA.jl)
# broadcasting
using Base.Broadcast: BroadcastStyle, Broadcasted
struct CuArrayStyle{N} <: AbstractGPUArrayStyle{N} end
CuArrayStyle(::Val{N}) where N = CuArrayStyle{N}()
CuArrayStyle{M}(::Val{N}) where {N,M} = CuArrayStyle{N}()
BroadcastStyle(::Type{CuArray{T,N}}) where {T,N} = CuArrayStyle{N}()
Base.similar(bc::Broadcasted{CuArrayStyle{N}}, ::Type{T}) where {N,T} =
similar(CuArray{T}, axes(bc))
Base.similar(bc::Broadcasted{CuArrayStyle{N}}, ::Type{T}, dims) where {N,T} =
CuArray{T}(undef, dims)
## replace base functions with libdevice alternatives
cufunc(f) = f
cufunc(::Type{T}) where T = (x...) -> T(x...) # broadcasting type ctors isn't GPU compatible
Broadcast.broadcasted(::CuArrayStyle{N}, f, args...) where {N} =
Broadcasted{CuArrayStyle{N}}(cufunc(f), args, nothing)
const device_intrinsics = :[
cos, cospi, sin, sinpi, tan, acos, asin, atan,
cosh, sinh, tanh, acosh, asinh, atanh,
log, log10, log1p, log2, logb, ilogb,
exp, exp2, exp10, expm1, ldexp,
erf, erfinv, erfc, erfcinv, erfcx,
brev, clz, ffs, byte_perm, popc,
isfinite, isinf, isnan, nearbyint,
nextafter, signbit, copysign, abs,
sqrt, rsqrt, cbrt, rcbrt, pow,
ceil, floor, saturate,
lgamma, tgamma,
j0, j1, jn, y0, y1, yn,
normcdf, normcdfinv, hypot,
fma, sad, dim, mul24, mul64hi, hadd, rhadd, scalbn].args
for f in device_intrinsics
isdefined(Base, f) || continue
@eval cufunc(::typeof(Base.$f)) = $f
end
# broadcast ^
culiteral_pow(::typeof(^), x::T, ::Val{0}) where {T<:Real} = one(x)
culiteral_pow(::typeof(^), x::T, ::Val{1}) where {T<:Real} = x
culiteral_pow(::typeof(^), x::T, ::Val{2}) where {T<:Real} = x * x
culiteral_pow(::typeof(^), x::T, ::Val{3}) where {T<:Real} = x * x * x
culiteral_pow(::typeof(^), x::T, ::Val{p}) where {T<:Real,p} = pow(x, Int32(p))
cufunc(::typeof(Base.literal_pow)) = culiteral_pow
cufunc(::typeof(Base.:(^))) = pow
using MacroTools
const _cufuncs = [copy(device_intrinsics); :^]
cufuncs() = (global _cufuncs; _cufuncs)
_cuint(x::Int) = Int32(x)
_cuint(x::Expr) = x.head == :call && x.args[1] == :Int32 && x.args[2] isa Int ? Int32(x.args[2]) : x
_cuint(x) = x
function _cupowliteral(x::Expr)
if x.head == :call && x.args[1] == :(CUDA.cufunc(^)) && x.args[3] isa Int32
num = x.args[3]
if 0 <= num <= 3
sym = gensym(:x)
new_x = Expr(:block, :($sym = $(x.args[2])))
if iszero(num)
push!(new_x.args, :(one($sym)))
else
unroll = Expr(:call, :*)
for x = one(num):num
push!(unroll.args, sym)
end
push!(new_x.args, unroll)
end
x = new_x
end
end
x
end
_cupowliteral(x) = x
function replace_device(ex)
global _cufuncs
MacroTools.postwalk(ex) do x
x = x in _cufuncs ? :(CUDA.cufunc($x)) : x
x = _cuint(x)
x = _cupowliteral(x)
x
end
end
macro cufunc(ex)
global _cufuncs
def = MacroTools.splitdef(ex)
f = def[:name]
def[:name] = Symbol(:cu, f)
def[:body] = replace_device(def[:body])
push!(_cufuncs, f)
quote
$(esc(MacroTools.combinedef(def)))
CUDA.cufunc(::typeof($(esc(f)))) = $(esc(def[:name]))
end
end