ENH: sparse: canonicalize on initialization

Duplicates are allowed in the constructor, but not handled anywhere else. See scipygh-5807 for the initial discussion.
perimosocordiae · Sep 30, 2016 · 86e0c0b · 86e0c0b
1 parent bc0d766
commit 86e0c0b
Show file tree

Hide file tree

Showing 13 changed files with 192 additions and 327 deletions.
diff --git a/scipy/io/harwell_boeing/hb.py b/scipy/io/harwell_boeing/hb.py
@@ -327,7 +327,8 @@ def _read_hb_data(content, header):
 
     try:
         return csc_matrix((val, ind-1, ptr-1),
-                          shape=(header.nrows, header.ncols))
+                          shape=(header.nrows, header.ncols),
+                          copy=False, canonicalize=False)
     except ValueError as e:
         raise e
 

diff --git a/scipy/sparse/bsr.py b/scipy/sparse/bsr.py
@@ -117,8 +117,10 @@ class bsr_matrix(_cs_matrix, _minmax_mixin):
     """
     format = 'bsr'
 
-    def __init__(self, arg1, shape=None, dtype=None, copy=False, blocksize=None):
+    def __init__(self, arg1, shape=None, dtype=None, copy=False, blocksize=None,
+                 canonicalize=True):
         _data_matrix.__init__(self)
+        needs_sum_duplicates = False
 
         if isspmatrix(arg1):
             if isspmatrix_bsr(arg1) and copy:
@@ -154,7 +156,9 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False, blocksize=None):
             elif len(arg1) == 2:
                 # (data,(row,col)) format
                 from .coo import coo_matrix
-                self._set_self(coo_matrix(arg1, dtype=dtype).tobsr(blocksize=blocksize))
+                coo = coo_matrix(arg1, dtype=dtype, copy=copy,
+                                 canonicalize=canonicalize)
+                self._set_self(coo.tobsr(blocksize=blocksize))
 
             elif len(arg1) == 3:
                 # (data,indices,indptr) format
@@ -171,7 +175,9 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False, blocksize=None):
 
                 self.indices = np.array(indices, copy=copy, dtype=idx_dtype)
                 self.indptr = np.array(indptr, copy=copy, dtype=idx_dtype)
-                self.data = np.array(data, copy=copy, dtype=getdtype(dtype, data))
+                self.data = np.array(data, copy=copy,
+                                     dtype=getdtype(dtype, data))
+                needs_sum_duplicates = bool(canonicalize)
             else:
                 raise ValueError('unrecognized bsr_matrix constructor usage')
         else:
@@ -211,6 +217,32 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False, blocksize=None):
 
         self.check_format(full_check=False)
 
+        if needs_sum_duplicates:
+            self.sort_indices()
+            R, C = self.blocksize
+            M, N = self.shape
+
+            # port of _sparsetools.csr_sum_duplicates
+            n_row = M // R
+            nnz = 0
+            row_end = 0
+            for i in range(n_row):
+                jj = row_end
+                row_end = self.indptr[i+1]
+                while jj < row_end:
+                    j = self.indices[jj]
+                    x = self.data[jj]
+                    jj += 1
+                    while jj < row_end and self.indices[jj] == j:
+                        x += self.data[jj]
+                        jj += 1
+                    self.indices[nnz] = j
+                    self.data[nnz] = x
+                    nnz += 1
+                self.indptr[i+1] = nnz
+
+            self.prune()  # nnz may have changed
+
     def check_format(self, full_check=True):
         """check whether the matrix format is valid
 
@@ -407,7 +439,8 @@ def _mul_sparse_matrix(self, other):
 
         # TODO eliminate zeros
 
-        return bsr_matrix((data,indices,indptr),shape=(M,N),blocksize=(R,C))
+        return bsr_matrix((data,indices,indptr),shape=(M,N),blocksize=(R,C),
+                          copy=False, canonicalize=False)
 
     ######################
     # Conversion methods #
@@ -473,7 +506,8 @@ def tocoo(self, copy=True):
             data = data.copy()
 
         from .coo import coo_matrix
-        return coo_matrix((data,(row,col)), shape=self.shape)
+        return coo_matrix((data,(row,col)), shape=self.shape, copy=False,
+                          canonicalize=False)
 
     def transpose(self, axes=None, copy=False):
         if axes is not None:
@@ -487,7 +521,7 @@ def transpose(self, axes=None, copy=False):
 
         if self.nnz == 0:
             return bsr_matrix((N, M), blocksize=(C, R),
-                              dtype=self.dtype, copy=copy)
+                              dtype=self.dtype, copy=copy, canonicalize=False)
 
         indptr = np.empty(N//C + 1, dtype=self.indptr.dtype)
         indices = np.empty(NBLK, dtype=self.indices.dtype)
@@ -498,7 +532,7 @@ def transpose(self, axes=None, copy=False):
                       indptr, indices, data.ravel())
 
         return bsr_matrix((data, indices, indptr),
-                          shape=(N, M), copy=copy)
+                          shape=(N, M), copy=copy, canonicalize=False)
 
     transpose.__doc__ = spmatrix.transpose.__doc__
 
@@ -524,39 +558,6 @@ def eliminate_zeros(self):
                                          self.indices, mask)
         self.prune()
 
-    def sum_duplicates(self):
-        """Eliminate duplicate matrix entries by adding them together
-
-        The is an *in place* operation
-        """
-        if self.has_canonical_format:
-            return
-        self.sort_indices()
-        R, C = self.blocksize
-        M, N = self.shape
-
-        # port of _sparsetools.csr_sum_duplicates
-        n_row = M // R
-        nnz = 0
-        row_end = 0
-        for i in range(n_row):
-            jj = row_end
-            row_end = self.indptr[i+1]
-            while jj < row_end:
-                j = self.indices[jj]
-                x = self.data[jj]
-                jj += 1
-                while jj < row_end and self.indices[jj] == j:
-                    x += self.data[jj]
-                    jj += 1
-                self.indices[nnz] = j
-                self.data[nnz] = x
-                nnz += 1
-            self.indptr[i+1] = nnz
-
-        self.prune()  # nnz may have changed
-        self.has_canonical_format = True
-
     def sort_indices(self):
         """Sort the indices of this matrix *in place*
         """
@@ -646,11 +647,11 @@ def _with_data(self,data,copy=True):
         (i.e. .indptr and .indices) are copied.
         """
         if copy:
-            return self.__class__((data,self.indices.copy(),self.indptr.copy()),
-                                   shape=self.shape,dtype=data.dtype)
+            arg1 = (data, self.indices.copy(), self.indptr.copy())
         else:
-            return self.__class__((data,self.indices,self.indptr),
-                                   shape=self.shape,dtype=data.dtype)
+            arg1 = (data, self.indices, self.indptr)
+        return self.__class__(arg1, shape=self.shape, dtype=data.dtype,
+                              copy=False, canonicalize=False)
 
 #    # these functions are used by the parent class
 #    # to remove redudancy between bsc_matrix and bsr_matrix

diff --git a/scipy/sparse/compressed.py b/scipy/sparse/compressed.py
@@ -21,8 +21,10 @@
 class _cs_matrix(_data_matrix, _minmax_mixin, IndexMixin):
     """base matrix class for compressed row and column oriented matrices"""
 
-    def __init__(self, arg1, shape=None, dtype=None, copy=False):
+    def __init__(self, arg1, shape=None, dtype=None, copy=False,
+                 canonicalize=True):
         _data_matrix.__init__(self)
+        needs_sum_duplicates = False
 
         if isspmatrix(arg1):
             if arg1.format == self.format and copy:
@@ -47,8 +49,9 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False):
                 if len(arg1) == 2:
                     # (data, ij) format
                     from .coo import coo_matrix
-                    other = self.__class__(coo_matrix(arg1, shape=shape))
-                    self._set_self(other)
+                    coo = coo_matrix(arg1, shape=shape, copy=copy,
+                                     canonicalize=canonicalize)
+                    self._set_self(self.__class__(coo))
                 elif len(arg1) == 3:
                     # (data, indices, indptr) format
                     (data, indices, indptr) = arg1
@@ -63,6 +66,7 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False):
                     self.indices = np.array(indices, copy=copy, dtype=idx_dtype)
                     self.indptr = np.array(indptr, copy=copy, dtype=idx_dtype)
                     self.data = np.array(data, copy=copy, dtype=dtype)
+                    needs_sum_duplicates = bool(canonicalize)
                 else:
                     raise ValueError("unrecognized %s_matrix constructor usage" %
                             self.format)
@@ -96,6 +100,13 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False):
 
         self.check_format(full_check=False)
 
+        if needs_sum_duplicates:
+            self.sort_indices()
+            M, N = self._swap(self.shape)
+            _sparsetools.csr_sum_duplicates(M, N, self.indptr, self.indices,
+                                            self.data)
+            self.prune()
+
     def getnnz(self, axis=None):
         if axis is None:
             return int(self.indptr[-1])
@@ -187,7 +198,6 @@ def check_format(self, full_check=True):
         #    warn('Indices were not in sorted order.  Sorting indices.')
         #    self.sort_indices()
         #    assert(self.has_sorted_indices())
-        # TODO check for duplicates?
 
     #######################
     # Boolean comparisons #
@@ -197,8 +207,8 @@ def _scalar_binopt(self, other, op):
         """Scalar version of self._binopt, for cases in which no new nonzeros
         are added. Produces a new spmatrix in canonical form.
         """
-        self.sum_duplicates()
         res = self._with_data(op(self.data, other), copy=True)
+        res.sort_indices()
         res.eliminate_zeros()
         return res
 
@@ -505,7 +515,8 @@ def _mul_sparse_matrix(self, other):
            other.data,
            indptr, indices, data)
 
-        return self.__class__((data,indices,indptr),shape=(M,N))
+        return self.__class__((data,indices,indptr), shape=(M,N), copy=False,
+                              canonicalize=False)
 
     def diagonal(self):
         """Returns the main diagonal of the matrix
@@ -531,10 +542,10 @@ def _maximum_minimum(self, other, npop, op_name, dense_check):
                 other_arr = self.__class__(other_arr)
                 return self._binopt(other_arr, op_name)
             else:
-                self.sum_duplicates()
                 new_data = npop(self.data, np.asarray(other))
                 mat = self.__class__((new_data, self.indices, self.indptr),
-                                     dtype=new_data.dtype, shape=self.shape)
+                                     dtype=new_data.dtype, shape=self.shape,
+                                     copy=False, canonicalize=False)
                 return mat
         elif isdense(other):
             return npop(self.todense(), other)
@@ -585,8 +596,6 @@ def sum(self, axis=None, dtype=None, out=None):
     def _minor_reduce(self, ufunc):
         """Reduce nonzeros with a ufunc over the minor axis when non-empty
 
-        Warning: this does not call sum_duplicates()
-
         Returns
         -------
         major_index : array of ints
@@ -714,7 +723,7 @@ def _set_many(self, i, j, x):
                                               n_samples, i, j, offsets)
         if ret == 1:
             # rinse and repeat
-            self.sum_duplicates()
+            self.sort_indices()
             _sparsetools.csr_sample_offsets(M, N, self.indptr,
                                             self.indices, n_samples, i, j,
                                             offsets)
@@ -752,7 +761,7 @@ def _zero_many(self, i, j):
                                               n_samples, i, j, offsets)
         if ret == 1:
             # rinse and repeat
-            self.sum_duplicates()
+            self.sort_indices()
             _sparsetools.csr_sample_offsets(M, N, self.indptr,
                                             self.indices, n_samples, i, j,
                                             offsets)
@@ -896,22 +905,23 @@ def _in_bounds(i0, i1, num):
         data, indices, indptr = aux[2], aux[1], aux[0]
         shape = self._swap((i1 - i0, j1 - j0))
 
-        return self.__class__((data, indices, indptr), shape=shape)
+        return self.__class__((data, indices, indptr), shape=shape, copy=False,
+                              canonicalize=False)
 
     ######################
     # Conversion methods #
     ######################
 
     def tocoo(self, copy=True):
         major_dim, minor_dim = self._swap(self.shape)
-        minor_indices = self.indices
+        minor_indices = np.array(self.indices, copy=copy)
         major_indices = np.empty(len(minor_indices), dtype=self.indices.dtype)
         _sparsetools.expandptr(major_dim, self.indptr, major_indices)
-        row, col = self._swap((major_indices, minor_indices))
+        row_col = self._swap((major_indices, minor_indices))
 
         from .coo import coo_matrix
-        return coo_matrix((self.data, (row, col)), self.shape, copy=copy,
-                          dtype=self.dtype)
+        return coo_matrix((self.data, row_col), self.shape, copy=copy,
+                          dtype=self.dtype, canonicalize=False)
 
     tocoo.__doc__ = spmatrix.tocoo.__doc__
 
@@ -933,51 +943,6 @@ def eliminate_zeros(self):
                                          self.data)
         self.prune()  # nnz may have changed
 
-    def __get_has_canonical_format(self):
-        """Determine whether the matrix has sorted indices and no duplicates
-
-        Returns
-            - True: if the above applies
-            - False: otherwise
-
-        has_canonical_format implies has_sorted_indices, so if the latter flag
-        is False, so will the former be; if the former is found True, the
-        latter flag is also set.
-        """
-
-        # first check to see if result was cached
-        if not getattr(self, '_has_sorted_indices', True):
-            # not sorted => not canonical
-            self._has_canonical_format = False
-        elif not hasattr(self, '_has_canonical_format'):
-            self.has_canonical_format = _sparsetools.csr_has_canonical_format(
-                len(self.indptr) - 1, self.indptr, self.indices)
-        return self._has_canonical_format
-
-    def __set_has_canonical_format(self, val):
-        self._has_canonical_format = bool(val)
-        if val:
-            self.has_sorted_indices = True
-
-    has_canonical_format = property(fget=__get_has_canonical_format,
-                                    fset=__set_has_canonical_format)
-
-    def sum_duplicates(self):
-        """Eliminate duplicate matrix entries by adding them together
-
-        The is an *in place* operation
-        """
-        if self.has_canonical_format:
-            return
-        self.sort_indices()
-
-        M, N = self._swap(self.shape)
-        _sparsetools.csr_sum_duplicates(M, N, self.indptr, self.indices,
-                                        self.data)
-
-        self.prune()  # nnz may have changed
-        self.has_canonical_format = True
-
     def __get_sorted(self):
         """Determine whether the matrix has sorted indices
 
@@ -1044,11 +1009,11 @@ def _with_data(self,data,copy=True):
         (i.e. .indptr and .indices) are copied.
         """
         if copy:
-            return self.__class__((data,self.indices.copy(),self.indptr.copy()),
-                                   shape=self.shape,dtype=data.dtype)
+            arg1 = (data, self.indices.copy(), self.indptr.copy())
         else:
-            return self.__class__((data,self.indices,self.indptr),
-                                   shape=self.shape,dtype=data.dtype)
+            arg1 = (data, self.indices, self.indptr)
+        return self.__class__(arg1, shape=self.shape, dtype=data.dtype,
+                              copy=False, canonicalize=False)
 
     def _binopt(self, other, op):
         """apply the binary operation fn to two sparse matrices."""
@@ -1087,9 +1052,8 @@ def _binopt(self, other, op):
             indices = indices.copy()
             data = data.copy()
 
-        A = self.__class__((data, indices, indptr), shape=self.shape)
-
-        return A
+        return self.__class__((data, indices, indptr), shape=self.shape,
+                              copy=False, canonicalize=False)
 
     def _divide_sparse(self, other):
         """