ENH: Sparse dtypes (#13849)

Add better support for int64 and bool data types in sparse objects
pandas-dev · Aug 31, 2016 · b6d3a81 · b6d3a81
1 parent 0c1e052
commit b6d3a81
Show file tree

Hide file tree

Showing 18 changed files with 696 additions and 176 deletions.
diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst
@@ -132,6 +132,61 @@ keeps an arrays of all of the locations where the data are not equal to the
 fill value. The ``block`` format tracks only the locations and sizes of blocks
 of data.
 
+.. _sparse.dtype:
+
+Sparse Dtypes
+-------------
+
+Sparse data should have the same dtype as its dense representation. Currently,
+``float64``, ``int64`` and ``bool`` dtypes are supported. Depending on the original
+dtype, ``fill_value`` default changes:
+
+- ``float64``: ``np.nan``
+- ``int64``: ``0``
+- ``bool``: ``False``
+
+.. ipython:: python
+
+   s = pd.Series([1, np.nan, np.nan])
+   s
+   s.to_sparse()
+
+   s = pd.Series([1, 0, 0])
+   s
+   s.to_sparse()
+
+   s = pd.Series([True, False, True])
+   s
+   s.to_sparse()
+
+You can change the dtype using ``.astype()``; the result is also sparse. Note that
+``.astype()`` also affects the ``fill_value`` to keep its dense representation.
+
+
+.. ipython:: python
+
+   s = pd.Series([1, 0, 0, 0, 0])
+   s
+   ss = s.to_sparse()
+   ss
+   ss.astype(np.float64)
+
+It raises if any value cannot be coerced to the specified dtype.
+
+.. code-block:: ipython
+
+   In [1]: ss = pd.Series([1, np.nan, np.nan]).to_sparse()
+
+   In [2]: ss
+   Out[2]:
+   0    1.0
+   1    NaN
+   2    NaN
+   dtype: float64
+   BlockIndex
+   Block locations: array([0], dtype=int32)
+   Block lengths: array([1], dtype=int32)
+
+   In [3]: ss.astype(np.int64)
+   ValueError: unable to coerce current fill_value nan to int64 dtype
+
 .. _sparse.calculation:
 
 Sparse Calculation

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -17,6 +17,7 @@ Highlights include:
 - ``.rolling()`` are now time-series aware, see :ref:`here <whatsnew_0190.enhancements.rolling_ts>`
 - pandas development api, see :ref:`here <whatsnew_0190.dev_api>`
 - ``PeriodIndex`` now has its own ``period`` dtype, and changed to be more consistent with other ``Index`` classes. See :ref:`here <whatsnew_0190.api.period>`
+- Sparse data structures gained enhanced support for ``int`` and ``bool`` dtypes, see :ref:`here <whatsnew_0190.sparse>`
 
 .. contents:: What's new in v0.19.0
     :local:
@@ -975,6 +976,51 @@ Sparse Changes
 
 These changes allow pandas to handle sparse data with more dtypes, and make for a smoother experience with data handling.
 
+
+``int64`` and ``bool`` support enhancements
+"""""""""""""""""""""""""""""""""""""""""""
+
+Sparse data structures gained enhanced support for ``int64`` and ``bool`` ``dtype`` (:issue:`667`, :issue:`13849`)
+
+Previously, sparse data were ``float64`` dtype by default, even if all inputs were ``int`` or ``bool`` dtype. You had to specify ``dtype`` explicitly to create sparse data with ``int64`` dtype. Also, ``fill_value`` had to be specified explicitly because its default was ``np.nan``, which doesn't appear in ``int64`` or ``bool`` data.
+
+.. code-block:: ipython
+
+   In [1]: pd.SparseArray([1, 2, 0, 0])
+   Out[1]:
+   [1.0, 2.0, 0.0, 0.0]
+   Fill: nan
+   IntIndex
+   Indices: array([0, 1, 2, 3], dtype=int32)
+
+   # specifying int64 dtype, but all values are stored in sp_values because
+   # fill_value default is np.nan
+   In [2]: pd.SparseArray([1, 2, 0, 0], dtype=np.int64)
+   Out[2]:
+   [1, 2, 0, 0]
+   Fill: nan
+   IntIndex
+   Indices: array([0, 1, 2, 3], dtype=int32)
+
+   In [3]: pd.SparseArray([1, 2, 0, 0], dtype=np.int64, fill_value=0)
+   Out[3]:
+   [1, 2, 0, 0]
+   Fill: 0
+   IntIndex
+   Indices: array([0, 1], dtype=int32)
+
+As of v0.19.0, sparse data keeps the input dtype, and assigns a more appropriate ``fill_value`` default (``0`` for ``int64`` dtype, ``False`` for ``bool`` dtype).
+
+.. ipython :: python
+
+   pd.SparseArray([1, 2, 0, 0], dtype=np.int64)
+   pd.SparseArray([True, False, False, False])
+
+See the :ref:`docs <sparse.dtype>` for more details.
+
+Operators now preserve dtypes
+"""""""""""""""""""""""""""""
+
 - Sparse data structure now can preserve ``dtype`` after arithmetic ops (:issue:`13848`)
 
 .. ipython:: python
@@ -1001,6 +1047,9 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan`
    Out[7]:
    ValueError: unable to coerce current fill_value nan to int64 dtype
 
+Other sparse fixes
+""""""""""""""""""
+
 - Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`)
 - ``SparseArray`` with ``bool`` dtype now supports logical (bool) operators (:issue:`14000`)
 - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
@@ -1011,6 +1060,11 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan`
 - Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`)
 - Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`)
 - Bug in single row slicing on multi-type ``SparseDataFrame``s, types were previously forced to float (:issue:`13917`)
+- Bug in ``SparseSeries`` slicing changes integer dtype to float (:issue:`8292`)
+- Bug in ``SparseDataFrame`` comparison ops may raise ``TypeError`` (:issue:`13001`)
+- Bug in ``SparseDataFrame.isnull`` raises ``ValueError`` (:issue:`8276`)
+- Bug in ``SparseSeries`` representation with ``bool`` dtype may raise ``IndexError`` (:issue:`13110`)
+- Bug in ``SparseSeries`` and ``SparseDataFrame`` of ``bool`` or ``int64`` dtype may display its values like ``float64`` dtype (:issue:`13110`)
 - Bug in sparse indexing using ``SparseArray`` with ``bool`` dtype may return incorrect result  (:issue:`13985`)
 - Bug in ``SparseArray`` created from ``SparseSeries`` may lose ``dtype`` (:issue:`13999`)
 - Bug in ``SparseSeries`` comparison with dense returns normal ``Series`` rather than ``SparseSeries`` (:issue:`13999`)
@@ -1053,7 +1107,6 @@ New behaviour:
    In [2]: i.get_indexer(['b', 'b', 'c']).dtype
    Out[2]: dtype('int64')
 
-
 .. _whatsnew_0190.deprecations:
 
 Deprecations

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3779,24 +3779,29 @@ def asof(self, where, subset=None):
     # ----------------------------------------------------------------------
     # Action Methods
 
-    def isnull(self):
-        """
+    _shared_docs['isnull'] = """
         Return a boolean same-sized object indicating if the values are null.
 
         See Also
         --------
         notnull : boolean inverse of isnull
         """
+
+    @Appender(_shared_docs['isnull'])
+    def isnull(self):
         return isnull(self).__finalize__(self)
 
-    def notnull(self):
-        """Return a boolean same-sized object indicating if the values are
+    _shared_docs['isnotnull'] = """
+        Return a boolean same-sized object indicating if the values are
         not null.
 
         See Also
         --------
         isnull : boolean inverse of notnull
         """
+
+    @Appender(_shared_docs['isnotnull'])
+    def notnull(self):
         return notnull(self).__finalize__(self)
 
     def clip(self, lower=None, upper=None, axis=None, *args, **kwargs):

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -2478,9 +2478,6 @@ def fill_value(self):
 
     @fill_value.setter
     def fill_value(self, v):
-        # we may need to upcast our fill to match our dtype
-        if issubclass(self.dtype.type, np.floating):
-            v = float(v)
         self.values.fill_value = v
 
     def to_dense(self):

diff --git a/pandas/formats/format.py b/pandas/formats/format.py
@@ -21,6 +21,7 @@
                                  is_numeric_dtype,
                                  is_datetime64_dtype,
                                  is_timedelta64_dtype)
+from pandas.types.generic import ABCSparseArray
 
 from pandas.core.base import PandasObject
 from pandas.core.index import Index, MultiIndex, _ensure_index
@@ -1966,6 +1967,8 @@ def _format(x):
         vals = self.values
         if isinstance(vals, Index):
             vals = vals._values
+        elif isinstance(vals, ABCSparseArray):
+            vals = vals.values
 
         is_float_type = lib.map_infer(vals, is_float) & notnull(vals)
         leading_space = is_float_type.any()

diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py
@@ -163,6 +163,13 @@ def compare_index_period(self, result, expected, typ, version):
         tm.assert_equal(result.freqstr, 'M')
         tm.assert_index_equal(result.shift(2), expected.shift(2))
 
+    def compare_sp_frame_float(self, result, expected, typ, version):
+        if LooseVersion(version) <= '0.18.1':
+            tm.assert_sp_frame_equal(result, expected, exact_indices=False,
+                                     check_dtype=False)
+        else:
+            tm.assert_sp_frame_equal(result, expected)
+
     def read_pickles(self, version):
         if not is_platform_little_endian():
             raise nose.SkipTest("known failure on non-little endian")