/
sparse.py
264 lines (217 loc) · 9.42 KB
/
sparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
# Copyright (c) 2014 Evalf
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
"""
The sparse module defines a dtype for numpy that represents sparse data in
n-dimensional coo-format. That is, every array element contains an index into a
larger sparse object, and a value, which can be of any numpy supported data
type including integers, floating point values and complex data. Additionally,
the dtype carries the shape of the sparse object as metadata, which makes the
numpy array into an entirely self contained sparse object.
In addition to the dtype, the sparse module provides a range of methods for
manipulation of sparse data, such as deduplication of indices, pruning of
zeros, sparse addition, and conversion to other sparse or dense data formats.
"""
import numpy
chunksize = 0x10000000 # 256MB
def dtype(shape, vtype=numpy.float64):
'''Numpy data dtype for sparse data.
Returns a structured dtype with fields 'index' and 'value', where index is
again structured with fields 'i0', 'i1', etc, and value is of type ``vtype``.
The indices are of the smallest unsigned integer type that can encode all
indices within ``shape``, and carry the shape as metadata.
Args
----
shape : :class:`tuple` of integers.
Shape of the sparse object.
vtype : :class:`numpy.dtype` or :class:`str`
Data dype of the sparse object (i.e. the nonzero values).
Returns
-------
dtype : :class:`numpy.dtype`
The sparse dtype.
'''
return _dtype([((int(n), 'i'+str(i)), '>u'+str(1 if n <= 256 else 2 if n <= 256**2 else 4 if n <= 256**4 else 8)) for i, n in enumerate(shape)], vtype)
def issparse(data):
return isinstance(data, numpy.ndarray) and issparsedtype(data.dtype)
def issparsedtype(dtype):
return dtype.names == ('index', 'value') and all(
len(value) == 3 and isinstance(value[2], int) and 0 <= value[2] < 256**value[0].itemsize
for value in dtype['index'].fields.values())
def ndim(data):
'''Dimension of the sparse object.'''
return len(data.dtype['index'].names)
def shape(data):
'''Shape of the sparse object.'''
itype = data.dtype['index']
return tuple(itype.fields[name][2] for name in itype.names)
def indices(data):
'''Tuple of indices of the nonzero values of the sparse object.'''
index = data['index']
return tuple(index[i] for i in index.dtype.names)
def values(data):
'''Nonzero values of the sparse object.'''
return data['value']
def extract(data):
'''Tuple of indices, values, and shape of the sparse object.'''
return indices(data), values(data), shape(data)
def empty(shape, vtype=numpy.float64):
'''Completely sparse array of given shape and data type.'''
return numpy.empty(0, dtype=dtype(shape, vtype))
def result_type(dtype0, *dtypes):
'''Sparse analogue of :func:`numpy.result_type`.'''
if not dtypes:
return dtype0
if any(dtype['index'] != dtype0['index'] for dtype in dtypes):
raise Exception('non-matching shapes')
return _dtype(dtype0['index'], numpy.result_type(dtype0['value'], *[dtype['value'] for dtype in dtypes]))
def dedup(data, inplace=False):
'''Deduplicate indices.
Dedup sorts data in lexicographical order and sums all values with matching
indices such that the returned array has at most one value per sparse index.
The sorting happens in place, which means that ``dedup`` changes the order of
the input argument. Additionally, if ``inplace`` is true, the deduplication
step reuses the input array's memory. This may affect the size of the array,
which should no longer be used after deduplication in place. In case the
input has no duplicates the input array is returned.
>>> from nutils.sparse import dtype, dedup
>>> from numpy import array
>>> A = array([((0,1),.1), ((1,0),.2), ((0,1),.3)], dtype=dtype([2,2]))
>>> dedup(A)
array([((0, 1), 0.4), ((1, 0), 0.2)],
dtype=[('index', [((2, 'i0'), 'u1'), ((2, 'i1'), 'u1')]), ('value', '<f8')])
'''
if not len(data):
return data
if not ndim(data):
return data['value'].sum()[numpy.newaxis].view(data.dtype)
data.view(numpy.void).sort(kind='stable') # stable = timsort
keep = data['index'][1:] != data['index'][:-1]
if keep.all():
return data
elif inplace:
buf = numpy.empty(chunksize // data.dtype.itemsize or 1, dtype=data.dtype)
n, = numpy.hstack([True, keep, True]).nonzero()
for i in range(0, len(n)-1, len(buf)):
s = numpy.diff(n[i:i+len(buf)+1])
overlap = i+len(s) > n[i]
chunk = buf[:len(s)] if overlap else data[i:i+len(s)]
numpy.take(data['index'], n[i:i+len(s)], out=chunk['index'])
chunk['value'].fill(0)
numpy.add.at(chunk['value'], numpy.arange(len(s)).repeat(s), data['value'][n[i]:n[i+len(s)]])
if overlap:
data[i:i+len(s)] = chunk
return _resize(data, len(n)-1)
else:
offsets = keep.cumsum()
dedup = numpy.empty(offsets[-1]+1, dtype=data.dtype)
dedup[0] = data[0]
numpy.compress(keep, data['index'][1:], out=dedup['index'][1:])
dedup['value'][1:].fill(0)
numpy.add.at(dedup['value'], offsets, data['value'][1:])
return dedup
def prune(data, inplace=False):
'''Prune zero values.
Prune returns a sparse object with all zero values removed. If ``inplace`` is
true the returned object reuses the input array's memory. This may affect the
size of the array, which should no longer be used after pruning in place. In
case the input has no zeros the input array is returned.
>>> from nutils.sparse import dtype, prune
>>> from numpy import array
>>> A = array([((0,1),.1), ((1,0),0), ((0,1),.3)], dtype=dtype([2,2]))
>>> prune(A)
array([((0, 1), 0.1), ((0, 1), 0.3)],
dtype=[('index', [((2, 'i0'), 'u1'), ((2, 'i1'), 'u1')]), ('value', '<f8')])
'''
if data['value'].all():
return data
elif inplace:
buf = numpy.empty(chunksize // data.dtype.itemsize or 1, dtype=data.dtype)
nz, = data['value'].nonzero()
for i in range(0, len(nz), len(buf)):
s = nz[i:i+len(buf)]
overlap = i+len(s) > s[0]
chunk = buf[:len(s)] if overlap else data[i:i+len(s)]
numpy.take(data, s, out=chunk)
if overlap:
data[i:i+len(s)] = chunk
return _resize(data, len(nz))
else:
return numpy.compress(data['value'], data)
def add(datas):
'''Add sparse objects.
Returns the sum of a list of sparse objects by concatenating the sparse
entries. The returned array is of the data type mandated by Numpy's promotion
rules. In case ``datas`` contains only one item of nonzero length and this
item has the correct data type, then this array is returned as-is.
>>> from nutils.sparse import dtype, add
>>> from numpy import array
>>> A = array([((0,1),.1), ((1,0),.2)], dtype=dtype([2,2]))
>>> B = array([((0,1),.3)], dtype=dtype([2,2]))
>>> add([A, B])
array([((0, 1), 0.1), ((1, 0), 0.2), ((0, 1), 0.3)],
dtype=[('index', [((2, 'i0'), 'u1'), ((2, 'i1'), 'u1')]), ('value', '<f8')])
'''
dtype = result_type(*[data.dtype for data in datas])
datas = [data for data in datas if len(data)]
if len(datas) == 1 and datas[0].dtype == dtype:
return datas[0]
retval = numpy.empty(sum(map(len, datas)), dtype=dtype)
if datas:
numpy.concatenate(datas, out=retval)
return retval
def toarray(data):
'''Convert sparse object to a dense array.
>>> from nutils.sparse import dtype, toarray
>>> from numpy import array
>>> A = array([((0,1),.1), ((1,0),.2), ((0,1),.3)], dtype=dtype([2,2]))
>>> toarray(A)
array([[ 0. , 0.4],
[ 0.2, 0. ]])
'''
indices, values, shape = extract(data)
if not shape:
return values.sum()
retval = numpy.zeros(shape, values.dtype)
numpy.add.at(retval, indices, values)
return retval
def fromarray(data):
'''Convert dense array to sparse object.
>>> from nutils.sparse import dtype, fromarray
>>> from numpy import array
>>> A = array([[0, .4], [.2, 0]])
>>> fromarray(A)
array([((0, 0), 0. ), ((0, 1), 0.4), ((1, 0), 0.2), ((1, 1), 0. )],
dtype=[('index', [((2, 'i0'), 'u1'), ((2, 'i1'), 'u1')]), ('value', '<f8')])
'''
retval = numpy.empty(data.size, dtype=dtype(data.shape, data.dtype))
retval.reshape(data.shape)['value'] = data
index = retval.reshape(data.shape)['index']
for i, sh in enumerate(data.shape):
index['i'+str(i)] = numpy.arange(sh).reshape([-1]+[1]*(data.ndim-i-1))
return retval
# internal methods
def _dtype(itype, vtype):
return numpy.dtype([('index', itype), ('value', vtype)])
def _resize(data, n):
if data.base is not None:
return data[:n]
data.resize(n, refcheck=False)
return data
# vim:sw=2:sts=2:et