-
Notifications
You must be signed in to change notification settings - Fork 12
/
chunks.jl
205 lines (169 loc) · 6.33 KB
/
chunks.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
"""
    eachchunk(a)

Return an iterator over the chunks of the array `a`, where every element marks the index
range covered by one chunk.

The generic `AbstractArray` fallback defined in this file returns a `GridChunks` object,
whose elements are tuples holding one `UnitRange` per dimension. That fallback is meant to
be overridden by the package that implements the interface.
"""
function eachchunk end
# Abstract supertype for the chunking of a single array dimension. Subtypes behave like a
# vector whose `i`-th element is the index range covered by the `i`-th chunk.
abstract type ChunkType <: AbstractVector{UnitRange} end
"""
    RegularChunks <: ChunkType

Defines chunking along a dimension where the chunks have constant size and a potential
offset for the first chunk. The last chunk is truncated to fit the array size.

# Fields
- `cs::Int`: the constant chunk size.
- `offset::Int`: offset of the chunk grid; the first chunk only covers `cs - offset`
  elements.
- `s::Int`: length of the array dimension; chunk ranges are clipped to `1:s`.
"""
struct RegularChunks <: ChunkType
cs::Int
offset::Int
s::Int
end
# Base methods
# Index range covered by the `i`-th chunk. The unclipped chunk spans `cs` elements ending
# at `i * cs - offset`; it is clipped to the valid axis range `1:r.s`.
function Base.getindex(r::RegularChunks, i::Int)
    @boundscheck checkbounds(r, i)
    upper = i * r.cs - r.offset
    lower = upper - r.cs + 1
    return max(lower, 1):min(upper, r.s)
end
# Number of chunks along the dimension, returned as a 1-tuple as required for an
# `AbstractVector`. Only the one-argument method is defined: `size(r, d)` is served by
# Base's generic `AbstractArray` fallback, which returns the chunk count for `d == 1` and
# `1` for trailing dimensions. The former dimension-agnostic method returned the chunk
# count for every `d`.
Base.size(r::RegularChunks) = (div(r.s + r.offset - 1, r.cs) + 1,)
# DiskArrays interface
# Chunking that results from restricting the dimension described by `r` to the contiguous
# index range `subs`. The chunk size is kept and the grid offset is shifted according to
# where `subs` starts relative to the original chunk grid.
function subsetchunks(r::RegularChunks, subs::AbstractUnitRange)
    snew = length(subs)
    newoffset = mod(first(subs) - 1 + r.offset, r.cs)
    rnew = RegularChunks(r.cs, newoffset, snew)
    # In case the new chunking is trivial and consists of a single chunk, we shorten the
    # chunk size to the subset length. The chunk size is clamped to 1 so that an empty
    # subset does not produce a chunk size of zero, which would make `size` divide by zero.
    if length(rnew) == 1
        rnew = RegularChunks(max(snew, 1), 0, snew)
    end
    return rnew
end
# Chunking for a non-unit range subset. This method only exists to make "reverse" work,
# i.e. `subs` must be exactly `r.s:-1:1`; every other range raises an `ArgumentError`
# instead of silently returning `nothing`.
function subsetchunks(r::RegularChunks, subs::AbstractRange)
    if step(subs) == -1 && first(subs) == r.s && last(subs) == 1
        # After reversal the (possibly truncated) last chunk becomes the first one, so the
        # new offset is the number of elements missing from it. `mod` maps a full last
        # chunk to offset 0 rather than `r.cs`, which would add an empty leading chunk.
        lastlen = length(last(r))
        newoffset = mod(r.cs - lastlen, r.cs)
        return RegularChunks(r.cs, newoffset, r.s)
    end
    throw(
        ArgumentError(
            "Can not subset RegularChunks with range $subs, only unit ranges and a full reverse are supported",
        ),
    )
end
# Chunk-size queries for regular chunking. Both the approximate and the maximum chunk size
# are the constant chunk size `r.cs`; the grid offset is the stored `r.offset`.
function approx_chunksize(r::RegularChunks)
    return r.cs
end

function grid_offset(r::RegularChunks)
    return r.offset
end

function max_chunksize(r::RegularChunks)
    return r.cs
end
"""
    IrregularChunks <: ChunkType

Defines chunks along a dimension where chunk sizes are not constant but arbitrary.

# Fields
- `offsets::Vector{Int}`: chunk boundaries; chunk `i` covers the index range
  `(offsets[i] + 1):offsets[i + 1]`, so there is one more offset than there are chunks.
"""
struct IrregularChunks <: ChunkType
offsets::Vector{Int}
end
"""
    IrregularChunks(; chunksizes)

Construct an `IrregularChunks` object from the given list of chunk sizes. The stored
offsets are the cumulative sums of `chunksizes` with a leading `0`.
"""
function IrregularChunks(; chunksizes)
    boundaries = cumsum(chunksizes)
    pushfirst!(boundaries, 0)
    return IrregularChunks(boundaries)
end
# Base methods
# Index range covered by the `i`-th chunk: it starts right after the `i`-th offset and
# ends at the following offset.
function Base.getindex(r::IrregularChunks, i::Int)
    @boundscheck checkbounds(r, i)
    start = r.offsets[i] + 1
    stop = r.offsets[i + 1]
    return start:stop
end
# Number of chunks: one less than the number of stored offsets.
function Base.size(r::IrregularChunks)
    nchunks = length(r.offsets) - 1
    return (nchunks,)
end
# DiskArrays interface
# Chunking that results from restricting the dimension described by `r` to the contiguous
# index range `subs`. Accepts any `AbstractUnitRange` (e.g. `Base.OneTo`), matching the
# corresponding `RegularChunks` method.
function subsetchunks(r::IrregularChunks, subs::AbstractUnitRange)
    # Indices into `r.offsets` that enclose the chunks touched by `subs`.
    c1 = searchsortedfirst(r.offsets, first(subs)) - 1
    c2 = searchsortedfirst(r.offsets, last(subs))
    offsnew = r.offsets[c1:c2]
    # Number of elements of the first touched chunk that lie before `first(subs)`.
    firstoffset = first(subs) - r.offsets[c1] - 1
    # Truncate the last touched chunk at the end of the subset.
    offsnew[end] = last(subs)
    # Shorten the first chunk, then shift all boundaries so that they start at zero.
    offsnew[2:end] .-= firstoffset
    offsnew .-= first(offsnew)
    return IrregularChunks(offsnew)
end
# Average chunk size, rounded to the nearest integer.
function approx_chunksize(r::IrregularChunks)
    chunklengths = diff(r.offsets)
    nchunks = length(r.offsets) - 1
    return round(Int, sum(chunklengths) / nchunks)
end
# Irregular chunks always report a grid offset of zero.
function grid_offset(r::IrregularChunks)
    return 0
end

# Largest chunk length, i.e. the largest difference between consecutive offsets.
function max_chunksize(r::IrregularChunks)
    chunklengths = diff(r.offsets)
    return maximum(chunklengths)
end
# N-dimensional grid of chunks, stored as one `ChunkType` per dimension. Indexing returns
# a tuple with one index range per dimension. The element type uses `Int` rather than a
# hard-coded `Int64` because the per-dimension ranges are built from `Int` fields; the two
# are the same type on 64-bit platforms.
struct GridChunks{N} <: AbstractArray{NTuple{N,UnitRange{Int}},N}
    chunks::Tuple{Vararg{ChunkType,N}}
end
# Build a chunk grid from one `ChunkType` per dimension passed as separate arguments.
function GridChunks(ct::ChunkType...)
    return GridChunks(ct)
end
# Build a regular chunk grid for the array-like `a`: its size is looked up and forwarded
# to the tuple-based constructor. `offset` defaults to zero in every dimension.
function GridChunks(a, chunksize; offset=(_ -> 0).(size(a)))
    dims = size(a)
    return GridChunks(dims, chunksize; offset=offset)
end
# Build a regular chunk grid from a size tuple `a`, per-dimension chunk sizes and optional
# per-dimension offsets (zero by default): one `RegularChunks` is created per dimension.
function GridChunks(a::Tuple, chunksize; offset=(_ -> 0).(a))
    perdim = map(
        (len, csize, off) -> RegularChunks(csize, off, len), a, chunksize, offset
    )
    return GridChunks(perdim)
end
# Base methods
# The chunk at grid position `i` is the tuple of the per-dimension index ranges.
function Base.getindex(g::GridChunks{N}, i::Vararg{Int,N}) where {N}
    @boundscheck checkbounds(g, i...)
    return map((dimchunks, idx) -> dimchunks[idx], g.chunks, i)
end
# Number of chunks along every dimension of the grid.
function Base.size(g::GridChunks)
    return map(length, g.chunks)
end
# DiskArrays interface
"""
    approx_chunksize(g::GridChunks)

Return the approximate chunk size of the grid, one value per dimension. For dimensions
with regular chunks this is the exact chunk size, while for dimensions with irregular
chunks it is the average chunk size. Useful for downstream applications that want to
distribute computations and need to know about chunk sizes.
"""
function approx_chunksize(g::GridChunks)
    return map(approx_chunksize, g.chunks)
end
"""
    grid_offset(g::GridChunks)

Return the offset of the grid for the first chunks, one value per dimension. Expect this
value to be non-zero for views into regular-gridded arrays. Useful for downstream
applications that want to distribute computations and need to know about chunk sizes.
"""
function grid_offset(g::GridChunks)
    return map(grid_offset, g.chunks)
end
"""
    max_chunksize(g::GridChunks)

Return the maximum chunk size of an array for each dimension. Useful for pre-allocating
arrays to make sure they can hold a chunk of data.
"""
function max_chunksize(g::GridChunks)
    return map(max_chunksize, g.chunks)
end
# Define the approx default maximum chunk size (in MB)
"""
The target chunk size in MB used when estimating a chunking for unchunked arrays.
Defaults to 100 MB. Stored in a `Ref`, so it can be changed through
`default_chunk_size[] = value`.
"""
const default_chunk_size = Ref(100)
"""
The fallback element size in bytes, used for arrays whose elements have an unknown size,
like strings. Defaults to 100 bytes. Stored in a `Ref`, so it can be changed through
`fallback_element_size[] = value`.
"""
const fallback_element_size = Ref(100)
# Fallback chunking for any `AbstractArray`, based on an estimated chunk size. Packages
# that implement the DiskArray interface should normally override this method.
eachchunk(a::AbstractArray) = estimate_chunksize(a)
# Chunked trait
# `Chunked` and `Unchunked` are field-less marker types used as trait values.
struct Chunked end
struct Unchunked end
# `haschunks(x)` returns the chunking trait of `x`. The generic method below classifies
# every object as `Unchunked()`.
# NOTE(review): presumably array types with a native chunk layout add a method returning
# `Chunked()` - confirm against the implementing packages.
function haschunks end
haschunks(x) = Unchunked()
"""
    element_size(a::AbstractArray)

Return the approximate size of a single element of `a` in bytes. For bits types this is
`sizeof` of the element type, and for `Union{Missing,T}` with a bits type `T` it is
`sizeof(T)`. Otherwise a warning is emitted and the value stored in
`DiskArrays.fallback_element_size` is returned. Methods can be added for custom
containers.
"""
function element_size(a::AbstractArray)
    T = eltype(a)
    isbitstype(T) && return sizeof(T)
    # Allow for missing values: measure the element type with `Missing` stripped.
    S = Base.nonmissingtype(T)
    isbitstype(S) && return sizeof(S)
    @warn "Can not determine size of element type. Using DiskArrays.fallback_element_size[] = $(fallback_element_size[]) bytes"
    return fallback_element_size[]
end
# Convenience method: estimate the chunking from the array's size and its element size.
function estimate_chunksize(a::AbstractArray)
    return estimate_chunksize(size(a), element_size(a))
end
# Estimate a regular chunking for an array of size `s` (a tuple of dimension lengths) with
# elements of `si` bytes, such that one chunk holds roughly `default_chunk_size[]` MB.
# Leading dimensions are kept whole as long as they fit into the target, the first
# dimension that exceeds it is split, and all following dimensions get chunk size 1.
# Returns a `GridChunks` object.
function estimate_chunksize(s, si)
    # Number of elements that fit into the target chunk size (MB -> bytes -> elements).
    target = default_chunk_size[] * 1e6 / si
    # First dimension at which the cumulative number of elements reaches the target.
    ii = searchsortedfirst(cumprod(collect(s)), target)
    cs = ntuple(length(s)) do idim
        # Every chunk size is clamped to at least 1: a zero-length dimension or an element
        # size above the target would otherwise give chunk size 0, which makes the chunk
        # count computation of `RegularChunks` divide by zero.
        if idim < ii
            return max(s[idim], 1)
        elseif idim > ii
            return 1
        else
            sbefore = idim == 1 ? 1 : prod(s[1:(idim - 1)])
            return max(floor(Int, target / sbefore), 1)
        end
    end
    return GridChunks(s, cs)
end