-
Notifications
You must be signed in to change notification settings - Fork 10
/
gpu.jl
159 lines (121 loc) · 5.72 KB
/
gpu.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
using CUDA
using CUDA: cufunc, curand_rng
using CUDA.CUSPARSE: CuSparseMatrix, CuSparseMatrixCSR, CuSparseMatrixCOO
using CUDA.CUSOLVER: CuQR
export cuda_gc, gpu, @gpu!, @cu!
const CuBaseField{B,M,T,A<:CuArray} = BaseField{B,M,T,A}
typealias(::Type{CuArray{T,N}}) where {T,N} = "CuArray{$T,$N}"
# a function version of @cuda which can be referenced before CUDA is
# loaded as long as it exists by run-time (unlike the macro @cuda which must
# exist at compile-time)
function cuda(f, args...; threads=256)
@cuda threads=threads f(args...)
end
is_gpu_backed(::BaseField{B,M,T,A}) where {B,M,T,A<:CuArray} = true
global_rng_for(::Type{<:CuArray}) = curand_rng()
# handy conversion functions and macros
@doc doc"""
@gpu! x y
Equivalent to `x = gpu(x)`, `y = gpu(y)`, etc... for any number of
listed variables. See [`gpu`](@ref).
"""
macro gpu!(vars...)
:(begin; $((:($(esc(var)) = gpu($(esc(var)))) for var in vars)...); nothing; end)
end
@doc doc"""
gpu(x)
Recursively moves x to GPU, but unlike `CUDA.cu`, doesn't also convert
to Float32. Equivalent to `adapt_structure(CuArray, x)`. Returns nothing.
"""
gpu(x) = adapt_structure(CuArray, x)
@doc doc"""
@cu! x y
Equivalent to `x = cu(x)`, `y = cu(y)`, etc... for any number of
listed variables. See `CUDA.cu`. Returns nothing.
"""
macro cu!(vars...)
:(begin; $((:($(esc(var)) = cu($(esc(var)))) for var in vars)...); nothing; end)
end
adapt_structure(::CUDA.Float32Adaptor, proj::ProjLambert) = adapt_structure(CuArray{Float32}, proj)
function Cℓ_to_2D(Cℓ, proj::ProjLambert{T,<:CuArray}) where {T}
# todo: remove needing to go through cpu here:
gpu(Complex{T}.(nan2zero.(Cℓ.(cpu(proj.ℓmag)))))
end
### misc
# the generic versions of these trigger scalar indexing of CUDA, so provide
# specialized versions:
pinv(D::Diagonal{T,<:CuBaseField}) where {T} = Diagonal(@. ifelse(isfinite(inv(D.diag)), inv(D.diag), $zero(T)))
inv(D::Diagonal{T,<:CuBaseField}) where {T} = any(Array((D.diag.==0)[:])) ? throw(SingularException(-1)) : Diagonal(inv.(D.diag))
fill!(f::CuBaseField, x) = (fill!(f.arr,x); f)
==(a::CuBaseField, b::CuBaseField) = (==)(promote(a.arr, b.arr)...)
sum(f::CuBaseField; dims=:) = (dims == :) ? sum(f.arr) : (1 in dims) ? error("Sum over invalid dims of CuFlatS0.") : f
# these only work for Reals in CUDA
# with these definitions, they work for Complex as well
CUDA.isfinite(x::Complex) = Base.isfinite(x)
CUDA.sqrt(x::Complex) = CUDA.sqrt(CUDA.abs(x)) * CUDA.exp(im*CUDA.angle(x)/2)
CUDA.culiteral_pow(::typeof(^), x::Complex, ::Val{2}) = x * x
CUDA.pow(x::Complex, p) = x^p
# until https://github.com/JuliaGPU/CUDA.jl/pull/618 (CUDA 2.5)
CUDA.cufunc(::typeof(angle)) = CUDA.angle
# adapting of SparseMatrixCSC ↔ CuSparseMatrixCSR (otherwise dense arrays created)
adapt_structure(::Type{<:CuArray}, L::SparseMatrixCSC) = CuSparseMatrixCSR(L)
adapt_structure(::Type{<:Array}, L::CuSparseMatrixCSR) = SparseMatrixCSC(L)
adapt_structure(::Type{<:CuArray}, L::CuSparseMatrixCSR) = L
adapt_structure(::Type{<:Array}, L::SparseMatrixCSC) = L
# CUDA somehow missing this one
# see https://github.com/JuliaGPU/CuArrays.jl/issues/103
# and https://github.com/JuliaGPU/CuArrays.jl/pull/580
ldiv!(qr::CuQR, x::CuVector) = qr.R \ (CuMatrix(qr.Q)' * x)
# some Random API which CUDA doesn't implement yet
Random.randn(rng::CUDA.CURAND.RNG, T::Random.BitFloatType) =
cpu(randn!(rng, CuVector{T}(undef,1)))[1]
# perhaps minor type-piracy, but this lets us simulate into a CuArray using the
# CPU random number generator
Random.randn!(rng::MersenneTwister, A::CuArray) =
(A .= adapt(CuArray, randn!(rng, adapt(Array, A))))
# CUDA makes some copies here as a workaround for JuliaGPU/CuArrays.jl#345 &
# NVIDIA/cuFFT#2714055 but it doesn't appear to be needed in the R2C case, and
# in the C2R case we pre-allocate the memory only once (via memoization) as well
# as doing the copy asynchronously
import CUDA.CUFFT: unsafe_execute!
using CUDA.CUFFT: rCuFFTPlan, cufftReal, cufftComplex, CUFFT_R2C, cufftExecR2C, cufftExecC2R, CUFFT_C2R, unsafe_copyto!, CuDefaultStream, pointer
plan_buffer(x) = plan_buffer(eltype(x),size(x))
@memoize plan_buffer(T,dims) = CuArray{T}(undef,dims...)
function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N},
x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N}
) where {K,N}
@assert plan.xtype == CUFFT_R2C
cufftExecR2C(plan, x, y)
end
function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N},
x::CuArray{cufftComplex,N}, y::CuArray{cufftReal}
) where {K,N}
@assert plan.xtype == CUFFT_C2R
cufftExecC2R(plan, unsafe_copyto!(pointer(plan_buffer(x)),pointer(x),length(x),async=true,stream=CuDefaultStream()), y)
end
# monkey-patched version of https://github.com/JuliaGPU/CUDA.jl/pull/436
# until it hits a release
using CUDA.CURAND: curandSetPseudoRandomGeneratorSeed, curandSetGeneratorOffset,
CURAND_STATUS_ALLOCATION_FAILED, CURAND_STATUS_PREEXISTING_FAILURE, CURAND_STATUS_SUCCESS,
unsafe_curandGenerateSeeds, throw_api_error, @retry_reclaim, RNG
function Random.seed!(rng::RNG, seed=Base.rand(UInt64), offset=0)
curandSetPseudoRandomGeneratorSeed(rng, seed)
curandSetGeneratorOffset(rng, offset)
res = @retry_reclaim err->isequal(err, CURAND_STATUS_ALLOCATION_FAILED) ||
isequal(err, CURAND_STATUS_PREEXISTING_FAILURE) begin
unsafe_curandGenerateSeeds(rng)
end
if res != CURAND_STATUS_SUCCESS
throw_api_error(res)
end
return
end
"""
cuda_gc()
Gargbage collect and reclaim GPU memory (technically should never be
needed to do this by hand, but sometimes helps with GPU OOM errors)
"""
function cuda_gc()
GC.gc(true)
CUDA.reclaim()
end