Skip to content

Commit

Permalink
ENH: sparse: canonicalize on initialization
Browse files Browse the repository at this point in the history
Duplicates are allowed in the constructor,
but not handled anywhere else.
See scipygh-5807 for the initial discussion.
  • Loading branch information
perimosocordiae committed Sep 30, 2016
1 parent bc0d766 commit 86e0c0b
Show file tree
Hide file tree
Showing 13 changed files with 192 additions and 327 deletions.
3 changes: 2 additions & 1 deletion scipy/io/harwell_boeing/hb.py
Expand Up @@ -327,7 +327,8 @@ def _read_hb_data(content, header):

try:
return csc_matrix((val, ind-1, ptr-1),
shape=(header.nrows, header.ncols))
shape=(header.nrows, header.ncols),
copy=False, canonicalize=False)
except ValueError as e:
raise e

Expand Down
89 changes: 45 additions & 44 deletions scipy/sparse/bsr.py
Expand Up @@ -117,8 +117,10 @@ class bsr_matrix(_cs_matrix, _minmax_mixin):
"""
format = 'bsr'

def __init__(self, arg1, shape=None, dtype=None, copy=False, blocksize=None):
def __init__(self, arg1, shape=None, dtype=None, copy=False, blocksize=None,
canonicalize=True):
_data_matrix.__init__(self)
needs_sum_duplicates = False

if isspmatrix(arg1):
if isspmatrix_bsr(arg1) and copy:
Expand Down Expand Up @@ -154,7 +156,9 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False, blocksize=None):
elif len(arg1) == 2:
# (data,(row,col)) format
from .coo import coo_matrix
self._set_self(coo_matrix(arg1, dtype=dtype).tobsr(blocksize=blocksize))
coo = coo_matrix(arg1, dtype=dtype, copy=copy,
canonicalize=canonicalize)
self._set_self(coo.tobsr(blocksize=blocksize))

elif len(arg1) == 3:
# (data,indices,indptr) format
Expand All @@ -171,7 +175,9 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False, blocksize=None):

self.indices = np.array(indices, copy=copy, dtype=idx_dtype)
self.indptr = np.array(indptr, copy=copy, dtype=idx_dtype)
self.data = np.array(data, copy=copy, dtype=getdtype(dtype, data))
self.data = np.array(data, copy=copy,
dtype=getdtype(dtype, data))
needs_sum_duplicates = bool(canonicalize)
else:
raise ValueError('unrecognized bsr_matrix constructor usage')
else:
Expand Down Expand Up @@ -211,6 +217,32 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False, blocksize=None):

self.check_format(full_check=False)

if needs_sum_duplicates:
self.sort_indices()
R, C = self.blocksize
M, N = self.shape

# port of _sparsetools.csr_sum_duplicates
n_row = M // R
nnz = 0
row_end = 0
for i in range(n_row):
jj = row_end
row_end = self.indptr[i+1]
while jj < row_end:
j = self.indices[jj]
x = self.data[jj]
jj += 1
while jj < row_end and self.indices[jj] == j:
x += self.data[jj]
jj += 1
self.indices[nnz] = j
self.data[nnz] = x
nnz += 1
self.indptr[i+1] = nnz

self.prune() # nnz may have changed

def check_format(self, full_check=True):
"""check whether the matrix format is valid
Expand Down Expand Up @@ -407,7 +439,8 @@ def _mul_sparse_matrix(self, other):

# TODO eliminate zeros

return bsr_matrix((data,indices,indptr),shape=(M,N),blocksize=(R,C))
return bsr_matrix((data,indices,indptr),shape=(M,N),blocksize=(R,C),
copy=False, canonicalize=False)

######################
# Conversion methods #
Expand Down Expand Up @@ -473,7 +506,8 @@ def tocoo(self, copy=True):
data = data.copy()

from .coo import coo_matrix
return coo_matrix((data,(row,col)), shape=self.shape)
return coo_matrix((data,(row,col)), shape=self.shape, copy=False,
canonicalize=False)

def transpose(self, axes=None, copy=False):
if axes is not None:
Expand All @@ -487,7 +521,7 @@ def transpose(self, axes=None, copy=False):

if self.nnz == 0:
return bsr_matrix((N, M), blocksize=(C, R),
dtype=self.dtype, copy=copy)
dtype=self.dtype, copy=copy, canonicalize=False)

indptr = np.empty(N//C + 1, dtype=self.indptr.dtype)
indices = np.empty(NBLK, dtype=self.indices.dtype)
Expand All @@ -498,7 +532,7 @@ def transpose(self, axes=None, copy=False):
indptr, indices, data.ravel())

return bsr_matrix((data, indices, indptr),
shape=(N, M), copy=copy)
shape=(N, M), copy=copy, canonicalize=False)

transpose.__doc__ = spmatrix.transpose.__doc__

Expand All @@ -524,39 +558,6 @@ def eliminate_zeros(self):
self.indices, mask)
self.prune()

def sum_duplicates(self):
"""Eliminate duplicate matrix entries by adding them together
The is an *in place* operation
"""
if self.has_canonical_format:
return
self.sort_indices()
R, C = self.blocksize
M, N = self.shape

# port of _sparsetools.csr_sum_duplicates
n_row = M // R
nnz = 0
row_end = 0
for i in range(n_row):
jj = row_end
row_end = self.indptr[i+1]
while jj < row_end:
j = self.indices[jj]
x = self.data[jj]
jj += 1
while jj < row_end and self.indices[jj] == j:
x += self.data[jj]
jj += 1
self.indices[nnz] = j
self.data[nnz] = x
nnz += 1
self.indptr[i+1] = nnz

self.prune() # nnz may have changed
self.has_canonical_format = True

def sort_indices(self):
"""Sort the indices of this matrix *in place*
"""
Expand Down Expand Up @@ -646,11 +647,11 @@ def _with_data(self,data,copy=True):
(i.e. .indptr and .indices) are copied.
"""
if copy:
return self.__class__((data,self.indices.copy(),self.indptr.copy()),
shape=self.shape,dtype=data.dtype)
arg1 = (data, self.indices.copy(), self.indptr.copy())
else:
return self.__class__((data,self.indices,self.indptr),
shape=self.shape,dtype=data.dtype)
arg1 = (data, self.indices, self.indptr)
return self.__class__(arg1, shape=self.shape, dtype=data.dtype,
copy=False, canonicalize=False)

# # these functions are used by the parent class
# # to remove redudancy between bsc_matrix and bsr_matrix
Expand Down
102 changes: 33 additions & 69 deletions scipy/sparse/compressed.py
Expand Up @@ -21,8 +21,10 @@
class _cs_matrix(_data_matrix, _minmax_mixin, IndexMixin):
"""base matrix class for compressed row and column oriented matrices"""

def __init__(self, arg1, shape=None, dtype=None, copy=False):
def __init__(self, arg1, shape=None, dtype=None, copy=False,
canonicalize=True):
_data_matrix.__init__(self)
needs_sum_duplicates = False

if isspmatrix(arg1):
if arg1.format == self.format and copy:
Expand All @@ -47,8 +49,9 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False):
if len(arg1) == 2:
# (data, ij) format
from .coo import coo_matrix
other = self.__class__(coo_matrix(arg1, shape=shape))
self._set_self(other)
coo = coo_matrix(arg1, shape=shape, copy=copy,
canonicalize=canonicalize)
self._set_self(self.__class__(coo))
elif len(arg1) == 3:
# (data, indices, indptr) format
(data, indices, indptr) = arg1
Expand All @@ -63,6 +66,7 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False):
self.indices = np.array(indices, copy=copy, dtype=idx_dtype)
self.indptr = np.array(indptr, copy=copy, dtype=idx_dtype)
self.data = np.array(data, copy=copy, dtype=dtype)
needs_sum_duplicates = bool(canonicalize)
else:
raise ValueError("unrecognized %s_matrix constructor usage" %
self.format)
Expand Down Expand Up @@ -96,6 +100,13 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False):

self.check_format(full_check=False)

if needs_sum_duplicates:
self.sort_indices()
M, N = self._swap(self.shape)
_sparsetools.csr_sum_duplicates(M, N, self.indptr, self.indices,
self.data)
self.prune()

def getnnz(self, axis=None):
if axis is None:
return int(self.indptr[-1])
Expand Down Expand Up @@ -187,7 +198,6 @@ def check_format(self, full_check=True):
# warn('Indices were not in sorted order. Sorting indices.')
# self.sort_indices()
# assert(self.has_sorted_indices())
# TODO check for duplicates?

#######################
# Boolean comparisons #
Expand All @@ -197,8 +207,8 @@ def _scalar_binopt(self, other, op):
"""Scalar version of self._binopt, for cases in which no new nonzeros
are added. Produces a new spmatrix in canonical form.
"""
self.sum_duplicates()
res = self._with_data(op(self.data, other), copy=True)
res.sort_indices()
res.eliminate_zeros()
return res

Expand Down Expand Up @@ -505,7 +515,8 @@ def _mul_sparse_matrix(self, other):
other.data,
indptr, indices, data)

return self.__class__((data,indices,indptr),shape=(M,N))
return self.__class__((data,indices,indptr), shape=(M,N), copy=False,
canonicalize=False)

def diagonal(self):
"""Returns the main diagonal of the matrix
Expand All @@ -531,10 +542,10 @@ def _maximum_minimum(self, other, npop, op_name, dense_check):
other_arr = self.__class__(other_arr)
return self._binopt(other_arr, op_name)
else:
self.sum_duplicates()
new_data = npop(self.data, np.asarray(other))
mat = self.__class__((new_data, self.indices, self.indptr),
dtype=new_data.dtype, shape=self.shape)
dtype=new_data.dtype, shape=self.shape,
copy=False, canonicalize=False)
return mat
elif isdense(other):
return npop(self.todense(), other)
Expand Down Expand Up @@ -585,8 +596,6 @@ def sum(self, axis=None, dtype=None, out=None):
def _minor_reduce(self, ufunc):
"""Reduce nonzeros with a ufunc over the minor axis when non-empty
Warning: this does not call sum_duplicates()
Returns
-------
major_index : array of ints
Expand Down Expand Up @@ -714,7 +723,7 @@ def _set_many(self, i, j, x):
n_samples, i, j, offsets)
if ret == 1:
# rinse and repeat
self.sum_duplicates()
self.sort_indices()
_sparsetools.csr_sample_offsets(M, N, self.indptr,
self.indices, n_samples, i, j,
offsets)
Expand Down Expand Up @@ -752,7 +761,7 @@ def _zero_many(self, i, j):
n_samples, i, j, offsets)
if ret == 1:
# rinse and repeat
self.sum_duplicates()
self.sort_indices()
_sparsetools.csr_sample_offsets(M, N, self.indptr,
self.indices, n_samples, i, j,
offsets)
Expand Down Expand Up @@ -896,22 +905,23 @@ def _in_bounds(i0, i1, num):
data, indices, indptr = aux[2], aux[1], aux[0]
shape = self._swap((i1 - i0, j1 - j0))

return self.__class__((data, indices, indptr), shape=shape)
return self.__class__((data, indices, indptr), shape=shape, copy=False,
canonicalize=False)

######################
# Conversion methods #
######################

def tocoo(self, copy=True):
major_dim, minor_dim = self._swap(self.shape)
minor_indices = self.indices
minor_indices = np.array(self.indices, copy=copy)
major_indices = np.empty(len(minor_indices), dtype=self.indices.dtype)
_sparsetools.expandptr(major_dim, self.indptr, major_indices)
row, col = self._swap((major_indices, minor_indices))
row_col = self._swap((major_indices, minor_indices))

from .coo import coo_matrix
return coo_matrix((self.data, (row, col)), self.shape, copy=copy,
dtype=self.dtype)
return coo_matrix((self.data, row_col), self.shape, copy=copy,
dtype=self.dtype, canonicalize=False)

tocoo.__doc__ = spmatrix.tocoo.__doc__

Expand All @@ -933,51 +943,6 @@ def eliminate_zeros(self):
self.data)
self.prune() # nnz may have changed

def __get_has_canonical_format(self):
"""Determine whether the matrix has sorted indices and no duplicates
Returns
- True: if the above applies
- False: otherwise
has_canonical_format implies has_sorted_indices, so if the latter flag
is False, so will the former be; if the former is found True, the
latter flag is also set.
"""

# first check to see if result was cached
if not getattr(self, '_has_sorted_indices', True):
# not sorted => not canonical
self._has_canonical_format = False
elif not hasattr(self, '_has_canonical_format'):
self.has_canonical_format = _sparsetools.csr_has_canonical_format(
len(self.indptr) - 1, self.indptr, self.indices)
return self._has_canonical_format

def __set_has_canonical_format(self, val):
self._has_canonical_format = bool(val)
if val:
self.has_sorted_indices = True

has_canonical_format = property(fget=__get_has_canonical_format,
fset=__set_has_canonical_format)

def sum_duplicates(self):
"""Eliminate duplicate matrix entries by adding them together
The is an *in place* operation
"""
if self.has_canonical_format:
return
self.sort_indices()

M, N = self._swap(self.shape)
_sparsetools.csr_sum_duplicates(M, N, self.indptr, self.indices,
self.data)

self.prune() # nnz may have changed
self.has_canonical_format = True

def __get_sorted(self):
"""Determine whether the matrix has sorted indices
Expand Down Expand Up @@ -1044,11 +1009,11 @@ def _with_data(self,data,copy=True):
(i.e. .indptr and .indices) are copied.
"""
if copy:
return self.__class__((data,self.indices.copy(),self.indptr.copy()),
shape=self.shape,dtype=data.dtype)
arg1 = (data, self.indices.copy(), self.indptr.copy())
else:
return self.__class__((data,self.indices,self.indptr),
shape=self.shape,dtype=data.dtype)
arg1 = (data, self.indices, self.indptr)
return self.__class__(arg1, shape=self.shape, dtype=data.dtype,
copy=False, canonicalize=False)

def _binopt(self, other, op):
"""apply the binary operation fn to two sparse matrices."""
Expand Down Expand Up @@ -1087,9 +1052,8 @@ def _binopt(self, other, op):
indices = indices.copy()
data = data.copy()

A = self.__class__((data, indices, indptr), shape=self.shape)

return A
return self.__class__((data, indices, indptr), shape=self.shape,
copy=False, canonicalize=False)

def _divide_sparse(self, other):
"""
Expand Down

0 comments on commit 86e0c0b

Please sign in to comment.