From 341585aa966ca9bd6dad98c819bc818e7d471e88 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 13 Aug 2016 15:00:35 +0900 Subject: [PATCH] ENH: Sparse dtypes --- doc/source/sparse.rst | 55 +++++++++++ doc/source/whatsnew/v0.19.0.txt | 55 ++++++++++- pandas/core/generic.py | 13 ++- pandas/core/internals.py | 3 - pandas/formats/format.py | 3 + pandas/io/tests/test_pickle.py | 7 ++ pandas/sparse/array.py | 115 +++++++++++----------- pandas/sparse/frame.py | 21 +++- pandas/sparse/series.py | 47 +++++---- pandas/sparse/tests/test_arithmetics.py | 59 +++++++++++ pandas/sparse/tests/test_array.py | 69 ++++++++----- pandas/sparse/tests/test_format.py | 58 ++++++++++- pandas/sparse/tests/test_frame.py | 125 ++++++++++++++++++++---- pandas/sparse/tests/test_indexing.py | 15 +++ pandas/sparse/tests/test_libsparse.py | 55 +++++++++++ pandas/sparse/tests/test_series.py | 102 ++++++++++++++----- pandas/tests/series/test_subclass.py | 47 ++++++--- pandas/util/testing.py | 23 +++-- 18 files changed, 696 insertions(+), 176 deletions(-) diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index db9734edde482..b6c5c15bc9081 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -132,6 +132,61 @@ keeps an arrays of all of the locations where the data are not equal to the fill value. The ``block`` format tracks only the locations and sizes of blocks of data. +.. _sparse.dtype: + +Sparse Dtypes +------------- + +Sparse data should have the same dtype as its dense representation. Currently, +``float64``, ``int64`` and ``bool`` dtypes are supported. Depending on the original +dtype, ``fill_value`` default changes: + +- ``float64``: ``np.nan`` +- ``int64``: ``0`` +- ``bool``: ``False`` + +.. ipython:: python + + s = pd.Series([1, np.nan, np.nan]) + s + s.to_sparse() + + s = pd.Series([1, 0, 0]) + s + s.to_sparse() + + s = pd.Series([True, False, True]) + s + s.to_sparse() + +You can change the dtype using ``.astype()``, the result is also sparse. 
Note that +``.astype()`` also affects the ``fill_value`` to keep its dense representation. + + +.. ipython:: python + + s = pd.Series([1, 0, 0, 0, 0]) + s + ss = s.to_sparse() + ss + ss.astype(np.float64) + +It raises if any value cannot be coerced to specified dtype. + +.. code-block:: ipython + + In [1]: ss = pd.Series([1, np.nan, np.nan]).to_sparse() + 0 1.0 + 1 NaN + 2 NaN + dtype: float64 + BlockIndex + Block locations: array([0], dtype=int32) + Block lengths: array([1], dtype=int32) + + In [2]: ss.astype(np.int64) + ValueError: unable to coerce current fill_value nan to int64 dtype + .. _sparse.calculation: Sparse Calculation diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 45cdd23140487..5760de26fecd3 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -17,6 +17,7 @@ Highlights include: - ``.rolling()`` are now time-series aware, see :ref:`here ` - pandas development api, see :ref:`here ` - ``PeriodIndex`` now has its own ``period`` dtype, and changed to be more consistent with other ``Index`` classes. See ref:`here ` +- Sparse data structures now gained enhanced support of ``int`` and ``bool`` dtypes, see :ref:`here ` .. contents:: What's new in v0.19.0 :local: @@ -975,6 +976,51 @@ Sparse Changes These changes allow pandas to handle sparse data with more dtypes, and for work to make a smoother experience with data handling. + +``int64`` and ``bool`` support enhancements +""""""""""""""""""""""""""""""""""""""""""" + +Sparse data structures now gained enhanced support of ``int64`` and ``bool`` ``dtype`` (:issue:`667`, :issue:`13849`) + +Previously, sparse data were ``float64`` dtype by default, even if all inputs were ``int`` or ``bool`` dtype. You had to specify ``dtype`` explicitly to create sparse data with ``int64`` dtype. Also, ``fill_value`` had to be specified explicitly because its default was ``np.nan`` which doesn't appear in ``int64`` or ``bool`` data. + +.. 
code-block:: ipython + + In [1]: pd.SparseArray([1, 2, 0, 0]) + Out[1]: + [1.0, 2.0, 0.0, 0.0] + Fill: nan + IntIndex + Indices: array([0, 1, 2, 3], dtype=int32) + + # specifying int64 dtype, but all values are stored in sp_values because + # fill_value default is np.nan + In [2]: pd.SparseArray([1, 2, 0, 0], dtype=np.int64) + Out[2]: + [1, 2, 0, 0] + Fill: nan + IntIndex + Indices: array([0, 1, 2, 3], dtype=int32) + + In [3]: pd.SparseArray([1, 2, 0, 0], dtype=np.int64, fill_value=0) + Out[3]: + [1, 2, 0, 0] + Fill: 0 + IntIndex + Indices: array([0, 1], dtype=int32) + +As of v0.19.0, sparse data keeps the input dtype, and assigns a more appropriate ``fill_value`` default (``0`` for ``int64`` dtype, ``False`` for ``bool`` dtype). + +.. ipython :: python + + pd.SparseArray([1, 2, 0, 0], dtype=np.int64) + pd.SparseArray([True, False, False, False]) + +See the :ref:`docs <sparse.dtype>` for more details. + +Operators now preserve dtypes +""""""""""""""""""""""""""""" + - Sparse data structure now can preserve ``dtype`` after arithmetic ops (:issue:`13848`) .. ipython:: python @@ -1001,6 +1047,9 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan` Out[7]: ValueError: unable to coerce current fill_value nan to int64 dtype +Other sparse fixes +"""""""""""""""""" + - Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. 
(:issue:`13787`) - ``SparseArray`` with ``bool`` dtype now supports logical (bool) operators (:issue:`14000`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) @@ -1011,6 +1060,11 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan` - Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) - Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) - Bug in single row slicing on multi-type ``SparseDataFrame``s, types were previously forced to float (:issue:`13917`) +- Bug in ``SparseSeries`` slicing changes integer dtype to float (:issue:`8292`) +- Bug in ``SparseDataFrame`` comparison ops may raise ``TypeError`` (:issue:`13001`) +- Bug in ``SparseDataFrame.isnull`` raises ``ValueError`` (:issue:`8276`) +- Bug in ``SparseSeries`` representation with ``bool`` dtype may raise ``IndexError`` (:issue:`13110`) +- Bug in ``SparseSeries`` and ``SparseDataFrame`` of ``bool`` or ``int64`` dtype may display its values like ``float64`` dtype (:issue:`13110`) - Bug in sparse indexing using ``SparseArray`` with ``bool`` dtype may return incorrect result (:issue:`13985`) - Bug in ``SparseArray`` created from ``SparseSeries`` may lose ``dtype`` (:issue:`13999`) - Bug in ``SparseSeries`` comparison with dense returns normal ``Series`` rather than ``SparseSeries`` (:issue:`13999`) @@ -1053,7 +1107,6 @@ New behaviour: In [2]: i.get_indexer(['b', 'b', 'c']).dtype Out[2]: dtype('int64') - .. 
_whatsnew_0190.deprecations: Deprecations diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8e295174771c4..2a6f00c65c7fb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3779,24 +3779,29 @@ def asof(self, where, subset=None): # ---------------------------------------------------------------------- # Action Methods - def isnull(self): - """ + _shared_docs['isnull'] = """ Return a boolean same-sized object indicating if the values are null. See Also -------- notnull : boolean inverse of isnull """ + + @Appender(_shared_docs['isnull']) + def isnull(self): return isnull(self).__finalize__(self) - def notnull(self): - """Return a boolean same-sized object indicating if the values are + _shared_docs['isnotnull'] = """ + Return a boolean same-sized object indicating if the values are not null. See Also -------- isnull : boolean inverse of notnull """ + + @Appender(_shared_docs['isnotnull']) + def notnull(self): return notnull(self).__finalize__(self) def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e11fd4086347f..bb2d1a9d1b5d3 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2478,9 +2478,6 @@ def fill_value(self): @fill_value.setter def fill_value(self, v): - # we may need to upcast our fill to match our dtype - if issubclass(self.dtype.type, np.floating): - v = float(v) self.values.fill_value = v def to_dense(self): diff --git a/pandas/formats/format.py b/pandas/formats/format.py index b83e3c4e73fdb..cb8fb3a5d2e49 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -21,6 +21,7 @@ is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype) +from pandas.types.generic import ABCSparseArray from pandas.core.base import PandasObject from pandas.core.index import Index, MultiIndex, _ensure_index @@ -1966,6 +1967,8 @@ def _format(x): vals = self.values if isinstance(vals, Index): vals = vals._values + elif 
isinstance(vals, ABCSparseArray): + vals = vals.values is_float_type = lib.map_infer(vals, is_float) & notnull(vals) leading_space = is_float_type.any() diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index 94885d90d3c4a..a49f50b1bcb9f 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -163,6 +163,13 @@ def compare_index_period(self, result, expected, typ, version): tm.assert_equal(result.freqstr, 'M') tm.assert_index_equal(result.shift(2), expected.shift(2)) + def compare_sp_frame_float(self, result, expected, typ, version): + if LooseVersion(version) <= '0.18.1': + tm.assert_sp_frame_equal(result, expected, exact_indices=False, + check_dtype=False) + else: + tm.assert_sp_frame_equal(result, expected) + def read_pickles(self, version): if not is_platform_little_endian(): raise nose.SkipTest("known failure on non-little endian") diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index ca9d5efe2fbe5..8420371d05e02 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -4,25 +4,25 @@ from __future__ import division # pylint: disable=E1101,E1103,W0231 -from numpy import nan, ndarray import numpy as np import pandas as pd from pandas.core.base import PandasObject -from pandas import compat, lib +from pandas import compat from pandas.compat import range from pandas.compat.numpy import function as nv from pandas.types.generic import ABCSparseArray, ABCSparseSeries -from pandas.types.common import (is_float, is_integer, - is_integer_dtype, _ensure_platform_int, +from pandas.types.common import (_ensure_platform_int, + is_float, is_integer, + is_integer_dtype, is_bool_dtype, is_list_like, is_scalar, is_dtype_equal) from pandas.types.cast import (_possibly_convert_platform, _maybe_promote, - _astype_nansafe) -from pandas.types.missing import isnull, notnull + _astype_nansafe, _find_common_type) +from pandas.types.missing import isnull, notnull, na_value_for_dtype from pandas._sparse import 
SparseIndex, BlockIndex, IntIndex import pandas._sparse as splib @@ -69,16 +69,6 @@ def wrapper(self, other): return wrapper -def _maybe_match_dtype(left, right): - if not hasattr(right, 'dtype'): - return left.dtype - elif left.dtype == right.dtype: - return getattr(left.dtype, '__name__', left.dtype) - else: - # ToDo: to be supported after GH 667 - raise NotImplementedError('dtypes must be identical') - - def _get_fill(arr): # coerce fill_value to arr dtype if possible # int64 SparseArray can have NaN as fill_value if there is no missing @@ -99,7 +89,15 @@ def _sparse_array_op(left, right, op, name, series=False): left = left.astype(np.float64) right = right.astype(np.float64) - dtype = _maybe_match_dtype(left, right) + # dtype used to find corresponding sparse method + if not is_dtype_equal(left.dtype, right.dtype): + dtype = _find_common_type([left.dtype, right.dtype]) + left = left.astype(dtype) + right = right.astype(dtype) + else: + dtype = left.dtype + + # dtype the result must have result_dtype = None if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: @@ -147,11 +145,11 @@ def _sparse_array_op(left, right, op, name, series=False): def _wrap_result(name, data, sparse_index, fill_value, dtype=None): """ wrap op result to have correct dtype """ if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): - # ToDo: We can remove this condition when removing - # SparseArray's dtype default when closing GH 667 dtype = np.bool - elif name == 'truediv': - dtype = np.float64 + + if is_bool_dtype(dtype): + # fill_value may be np.bool_ + fill_value = bool(fill_value) return SparseArray(data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype) @@ -164,7 +162,8 @@ class SparseArray(PandasObject, np.ndarray): data : {array-like (1-D), Series, SparseSeries, dict} kind : {'block', 'integer'} fill_value : float - Defaults to NaN (code for missing) + Code for missing value. Defaults depends on dtype. 
+ 0 for int dtype, False for bool dtype, and NaN for other dtypes sparse_index : {BlockIndex, IntIndex}, optional Only if you have one. Mainly used internally @@ -182,7 +181,7 @@ class SparseArray(PandasObject, np.ndarray): fill_value = None def __new__(cls, data, sparse_index=None, index=None, kind='integer', - fill_value=None, dtype=np.float64, copy=False): + fill_value=None, dtype=None, copy=False): if index is not None: if data is None: @@ -199,25 +198,18 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', if dtype is not None: dtype = np.dtype(dtype) - if is_sparse_array: - # temp, always inherit passed SparseArray dtype - # can be removed after GH 13849 - dtype = data.dtype - - if fill_value is None: - if is_sparse_array: - fill_value = data.fill_value - else: - fill_value = nan if is_sparse_array: sparse_index = data.sp_index - values = np.asarray(data) + values = data.sp_values + fill_value = data.fill_value else: # array-like if sparse_index is None: - values, sparse_index = make_sparse(data, kind=kind, - fill_value=fill_value) + if dtype is not None: + data = np.asarray(data, dtype=dtype) + res = make_sparse(data, kind=kind, fill_value=fill_value) + values, sparse_index, fill_value = res else: values = _sanitize_values(data) if len(values) != sparse_index.npoints: @@ -226,31 +218,25 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', " index".format(type(values))) # Create array, do *not* copy data by default if copy: - try: - # ToDo: Can remove this error handling when we actually - # support other dtypes - subarr = np.array(values, dtype=dtype, copy=True) - except ValueError: - subarr = np.array(values, copy=True) + subarr = np.array(values, dtype=dtype, copy=True) else: - try: - subarr = np.asarray(values, dtype=dtype) - except ValueError: - subarr = np.asarray(values) - - # if we have a bool type, make sure that we have a bool fill_value - if ((dtype is not None and issubclass(dtype.type, np.bool_)) or - (data 
is not None and lib.is_bool_array(subarr))): - if np.isnan(fill_value) or not fill_value: - fill_value = False - else: - fill_value = bool(fill_value) - + subarr = np.asarray(values, dtype=dtype) # Change the class of the array to be the subclass type. return cls._simple_new(subarr, sparse_index, fill_value) @classmethod def _simple_new(cls, data, sp_index, fill_value): + if not isinstance(sp_index, SparseIndex): + # caller must pass SparseIndex + raise ValueError('sp_index must be a SparseIndex') + + if fill_value is None: + if sp_index.ngaps > 0: + # has missing hole + fill_value = np.nan + else: + fill_value = na_value_for_dtype(data.dtype) + if (is_integer_dtype(data) and is_float(fill_value) and sp_index.ngaps > 0): # if float fill_value is being included in dense repr, @@ -318,7 +304,7 @@ def __array_finalize__(self, obj): def __reduce__(self): """Necessary for making this object picklable""" - object_state = list(ndarray.__reduce__(self)) + object_state = list(np.ndarray.__reduce__(self)) subclass_state = self.fill_value, self.sp_index object_state[2] = (object_state[2], subclass_state) return tuple(object_state) @@ -326,7 +312,7 @@ def __reduce__(self): def __setstate__(self, state): """Necessary for making this object picklable""" nd_state, own_state = state - ndarray.__setstate__(self, nd_state) + np.ndarray.__setstate__(self, nd_state) fill_value, sp_index = own_state[:2] self.sp_index = sp_index @@ -404,9 +390,11 @@ def __iter__(self): yield self._get_val_at(i) def __getitem__(self, key): + """ """ + if is_integer(key): return self._get_val_at(key) elif isinstance(key, tuple): @@ -531,7 +519,11 @@ def astype(self, dtype=None, copy=True): dtype = np.dtype(dtype) sp_values = _astype_nansafe(self.sp_values, dtype, copy=copy) try: - fill_value = dtype.type(self.fill_value) + if is_bool_dtype(dtype): + # to avoid np.bool_ dtype + fill_value = bool(self.fill_value) + else: + fill_value = dtype.type(self.fill_value) except ValueError: msg = 'unable to coerce 
current fill_value {0} to {1} dtype' raise ValueError(msg.format(self.fill_value, dtype)) @@ -726,7 +718,7 @@ def _sanitize_values(arr): return arr -def make_sparse(arr, kind='block', fill_value=nan): +def make_sparse(arr, kind='block', fill_value=None): """ Convert ndarray to sparse format @@ -746,6 +738,9 @@ def make_sparse(arr, kind='block', fill_value=nan): if arr.ndim > 1: raise TypeError("expected dimension <= 1 data") + if fill_value is None: + fill_value = na_value_for_dtype(arr.dtype) + if isnull(fill_value): mask = notnull(arr) else: @@ -760,7 +755,7 @@ def make_sparse(arr, kind='block', fill_value=nan): index = _make_index(length, indices, kind) sparsified_values = arr[mask] - return sparsified_values, index + return sparsified_values, index, fill_value def _make_index(length, indices, kind): diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index f382a4b869a3e..8eeff045d1fac 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -11,6 +11,7 @@ import numpy as np from pandas.types.missing import isnull, notnull +from pandas.types.cast import _maybe_upcast from pandas.types.common import _ensure_platform_int from pandas.core.common import _try_sort @@ -22,12 +23,15 @@ import pandas.core.algorithms as algos from pandas.core.internals import (BlockManager, create_block_manager_from_arrays) -from pandas.core.generic import NDFrame +import pandas.core.generic as generic from pandas.sparse.series import SparseSeries, SparseArray from pandas.util.decorators import Appender import pandas.core.ops as ops +_shared_doc_kwargs = dict(klass='SparseDataFrame') + + class SparseDataFrame(DataFrame): """ DataFrame containing sparse floating point data in the form of SparseSeries @@ -118,7 +122,7 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None, if dtype is not None: mgr = mgr.astype(dtype) - NDFrame.__init__(self, mgr) + generic.NDFrame.__init__(self, mgr) @property def _constructor(self): @@ -509,7 +513,7 @@ def 
_combine_match_columns(self, other, func, level=None, fill_value=None): new_data, index=self.index, columns=union, default_fill_value=self.default_fill_value).__finalize__(self) - def _combine_const(self, other, func): + def _combine_const(self, other, func, raise_on_error=True): return self._apply_columns(lambda x: func(x, other)) def _reindex_index(self, index, method, copy, level, fill_value=np.nan, @@ -542,6 +546,9 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan, new = values.take(indexer) if need_mask: new = new.values + # convert integer to float if necessary. need to do a lot + # more than that, handle boolean etc also + new, fill_value = _maybe_upcast(new, fill_value=fill_value) np.putmask(new, mask, fill_value) new_series[col] = new @@ -686,6 +693,14 @@ def cumsum(self, axis=0, *args, **kwargs): return self.apply(lambda x: x.cumsum(), axis=axis) + @Appender(generic._shared_docs['isnull']) + def isnull(self): + return self._apply_columns(lambda x: x.isnull()) + + @Appender(generic._shared_docs['isnotnull']) + def isnotnull(self): + return self._apply_columns(lambda x: x.isnotnull()) + def apply(self, func, axis=0, broadcast=False, reduce=False): """ Analogous to DataFrame.apply, for SparseDataFrame diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 888dbde8ffb0f..ad9168890b8f2 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -8,7 +8,7 @@ import numpy as np import warnings -from pandas.types.missing import isnull +from pandas.types.missing import isnull, notnull from pandas.types.common import is_scalar from pandas.core.common import _values_from_object, _maybe_match_name @@ -91,7 +91,8 @@ class SparseSeries(Series): data : {array-like, Series, SparseSeries, dict} kind : {'block', 'integer'} fill_value : float - Defaults to NaN (code for missing) + Code for missing value. Defaults depends on dtype. 
+ 0 for int dtype, False for bool dtype, and NaN for other dtypes sparse_index : {BlockIndex, IntIndex}, optional Only if you have one. Mainly used internally @@ -125,26 +126,20 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', if isinstance(data, Series) and name is None: name = data.name - is_sparse_array = isinstance(data, SparseArray) - if fill_value is None: - if is_sparse_array: - fill_value = data.fill_value - else: - fill_value = np.nan - - if is_sparse_array: - if isinstance(data, SparseSeries) and index is None: - index = data.index.view() - elif index is not None: + if isinstance(data, SparseArray): + if index is not None: assert (len(index) == len(data)) - sparse_index = data.sp_index + if fill_value is None: + fill_value = data.fill_value + data = np.asarray(data) elif isinstance(data, SparseSeries): if index is None: index = data.index.view() - + if fill_value is None: + fill_value = data.fill_value # extract the SingleBlockManager data = data._data @@ -153,14 +148,14 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', index = data.index.view() data = Series(data) - data, sparse_index = make_sparse(data, kind=kind, - fill_value=fill_value) + res = make_sparse(data, kind=kind, fill_value=fill_value) + data, sparse_index, fill_value = res elif isinstance(data, (tuple, list, np.ndarray)): # array-like if sparse_index is None: - data, sparse_index = make_sparse(data, kind=kind, - fill_value=fill_value) + res = make_sparse(data, kind=kind, fill_value=fill_value) + data, sparse_index, fill_value = res else: assert (len(data) == sparse_index.npoints) @@ -636,6 +631,20 @@ def cumsum(self, axis=0, *args, **kwargs): # TODO: gh-12855 - return a SparseSeries here return Series(new_array, index=self.index).__finalize__(self) + @Appender(generic._shared_docs['isnull']) + def isnull(self): + arr = SparseArray(isnull(self.values.sp_values), + sparse_index=self.values.sp_index, + fill_value=isnull(self.fill_value)) 
+ return self._constructor(arr, index=self.index).__finalize__(self) + + @Appender(generic._shared_docs['isnotnull']) + def isnotnull(self): + arr = SparseArray(notnull(self.values.sp_values), + sparse_index=self.values.sp_index, + fill_value=notnull(self.fill_value)) + return self._constructor(arr, index=self.index).__finalize__(self) + def dropna(self, axis=0, inplace=False, **kwargs): """ Analogous to Series.dropna. If fill_value=NaN, returns a dense Series diff --git a/pandas/sparse/tests/test_arithmetics.py b/pandas/sparse/tests/test_arithmetics.py index def3d15a43f0f..f24244b38c42b 100644 --- a/pandas/sparse/tests/test_arithmetics.py +++ b/pandas/sparse/tests/test_arithmetics.py @@ -357,6 +357,65 @@ def test_bool_array_logical(self): fill_value=fill_value) self._check_logical_ops(a, b, values, rvalues) + def test_mixed_array_float_int(self): + + for rdtype in ['int64']: + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) + + for kind in ['integer', 'block']: + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self.assertEqual(b.dtype, rdtype) + + self._check_numeric_ops(a, b, values, rvalues) + self._check_numeric_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + self.assertEqual(b.dtype, rdtype) + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self.assertEqual(b.dtype, rdtype) + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + self.assertEqual(b.dtype, rdtype) + self._check_numeric_ops(a, b, values, rvalues) + + def test_mixed_array_comparison(self): + + # int32 NI ATM + for rdtype in ['int64']: + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + 
rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) + + for kind in ['integer', 'block']: + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self.assertEqual(b.dtype, rdtype) + + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + self.assertEqual(b.dtype, rdtype) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self.assertEqual(b.dtype, rdtype) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + self.assertEqual(b.dtype, rdtype) + self._check_comparison_ops(a, b, values, rvalues) + class TestSparseSeriesArithmetic(TestSparseArrayArithmetics): diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index 63e29656b66ea..dd86e9e791e5e 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -30,9 +30,13 @@ def test_constructor_dtype(self): self.assertEqual(arr.dtype, np.float64) self.assertEqual(arr.fill_value, 0) + arr = SparseArray([0, 1, 2, 4], dtype=np.float64) + self.assertEqual(arr.dtype, np.float64) + self.assertTrue(np.isnan(arr.fill_value)) + arr = SparseArray([0, 1, 2, 4], dtype=np.int64) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=np.int64) self.assertEqual(arr.dtype, np.int64) @@ -40,7 +44,7 @@ def test_constructor_dtype(self): arr = SparseArray([0, 1, 2, 4], dtype=None) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=None) 
self.assertEqual(arr.dtype, np.int64) @@ -63,13 +67,13 @@ def test_constructor_spindex_dtype(self): self.assertEqual(arr.dtype, np.float64) self.assertTrue(np.isnan(arr.fill_value)) - arr = SparseArray(data=[0, 1, 2, 3], - sparse_index=IntIndex(4, [0, 1, 2, 3]), - dtype=np.int64) - exp = SparseArray([0, 1, 2, 3], dtype=np.int64) + arr = SparseArray(data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=np.int64, fill_value=0) + exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0) tm.assert_sp_array_equal(arr, exp) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64) @@ -78,22 +82,20 @@ def test_constructor_spindex_dtype(self): self.assertEqual(arr.dtype, np.int64) self.assertEqual(arr.fill_value, 0) - arr = SparseArray(data=[0, 1, 2, 3], - sparse_index=IntIndex(4, [0, 1, 2, 3]), - dtype=None) + arr = SparseArray(data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=None, fill_value=0) exp = SparseArray([0, 1, 2, 3], dtype=None) tm.assert_sp_array_equal(arr, exp) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) # scalar input - arr = SparseArray(data=1, - sparse_index=IntIndex(1, [0]), - dtype=None) + arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) exp = SparseArray([1], dtype=None) tm.assert_sp_array_equal(arr, exp) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None) @@ -576,44 +578,63 @@ def test_generator_warnings(self): def test_fillna(self): s = SparseArray([1, np.nan, np.nan, 3, np.nan]) res = s.fillna(-1) - exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1) + exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1, 
dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0) res = s.fillna(-1) - exp = SparseArray([1, -1, -1, 3, -1], fill_value=0) + exp = SparseArray([1, -1, -1, 3, -1], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, 0, 3, 0]) res = s.fillna(-1) - exp = SparseArray([1, -1, 0, 3, 0], fill_value=-1) + exp = SparseArray([1, -1, 0, 3, 0], fill_value=-1, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, 0, 3, 0], fill_value=0) res = s.fillna(-1) - exp = SparseArray([1, -1, 0, 3, 0], fill_value=0) + exp = SparseArray([1, -1, 0, 3, 0], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([np.nan, np.nan, np.nan, np.nan]) res = s.fillna(-1) - exp = SparseArray([-1, -1, -1, -1], fill_value=-1) + exp = SparseArray([-1, -1, -1, -1], fill_value=-1, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([np.nan, np.nan, np.nan, np.nan], fill_value=0) res = s.fillna(-1) - exp = SparseArray([-1, -1, -1, -1], fill_value=0) + exp = SparseArray([-1, -1, -1, -1], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) - s = SparseArray([0, 0, 0, 0]) + # float dtype's fill_value is np.nan, replaced by -1 + s = SparseArray([0., 0., 0., 0.]) res = s.fillna(-1) - exp = SparseArray([0, 0, 0, 0], fill_value=-1) + exp = SparseArray([0., 0., 0., 0.], fill_value=-1) tm.assert_sp_array_equal(res, exp) + # int dtype shouldn't have missing. No changes. + s = SparseArray([0, 0, 0, 0]) + self.assertEqual(s.dtype, np.int64) + self.assertEqual(s.fill_value, 0) + res = s.fillna(-1) + tm.assert_sp_array_equal(res, s) + s = SparseArray([0, 0, 0, 0], fill_value=0) + self.assertEqual(s.dtype, np.int64) + self.assertEqual(s.fill_value, 0) res = s.fillna(-1) exp = SparseArray([0, 0, 0, 0], fill_value=0) tm.assert_sp_array_equal(res, exp) + # fill_value can be nan if there is no missing hole. 
+ # only fill_value will be changed + s = SparseArray([0, 0, 0, 0], fill_value=np.nan) + self.assertEqual(s.dtype, np.int64) + self.assertTrue(np.isnan(s.fill_value)) + res = s.fillna(-1) + exp = SparseArray([0, 0, 0, 0], fill_value=-1) + tm.assert_sp_array_equal(res, exp) + def test_fillna_overlap(self): s = SparseArray([1, np.nan, np.nan, 3, np.nan]) # filling with existing value doesn't replace existing value with @@ -624,7 +645,7 @@ def test_fillna_overlap(self): s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0) res = s.fillna(3) - exp = SparseArray([1, 3, 3, 3, 3], fill_value=0) + exp = SparseArray([1, 3, 3, 3, 3], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) diff --git a/pandas/sparse/tests/test_format.py b/pandas/sparse/tests/test_format.py index 9bdc1fdd101ea..377eaa20565a2 100644 --- a/pandas/sparse/tests/test_format.py +++ b/pandas/sparse/tests/test_format.py @@ -13,7 +13,7 @@ use_32bit_repr = is_platform_windows() or is_platform_32bit() -class TestSeriesFormatting(tm.TestCase): +class TestSparseSeriesFormatting(tm.TestCase): _multiprocess_can_split_ = True @@ -62,3 +62,59 @@ def test_sparse_mi_max_row(self): "Block locations: array([0, 3]{0})\n" "Block lengths: array([1, 1]{0})".format(dfm)) self.assertEqual(result, exp) + + def test_sparse_bool(self): + # GH 13110 + s = pd.SparseSeries([True, False, False, True, False, False], + fill_value=False) + result = repr(s) + dtype = '' if use_32bit_repr else ', dtype=int32' + exp = ("0 True\n1 False\n2 False\n" + "3 True\n4 False\n5 False\n" + "dtype: bool\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + with option_context("display.max_rows", 3): + result = repr(s) + exp = ("0 True\n ... 
\n5 False\n" + "dtype: bool\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + def test_sparse_int(self): + # GH 13110 + s = pd.SparseSeries([0, 1, 0, 0, 1, 0], fill_value=False) + + result = repr(s) + dtype = '' if use_32bit_repr else ', dtype=int32' + exp = ("0 0\n1 1\n2 0\n3 0\n4 1\n" + "5 0\ndtype: int64\nBlockIndex\n" + "Block locations: array([1, 4]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + with option_context("display.max_rows", 3): + result = repr(s) + exp = ("0 0\n ..\n5 0\n" + "dtype: int64\nBlockIndex\n" + "Block locations: array([1, 4]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + +class TestSparseDataFrameFormatting(tm.TestCase): + + def test_sparse_frame(self): + # GH 13110 + df = pd.DataFrame({'A': [True, False, True, False, True], + 'B': [True, False, True, False, True], + 'C': [0, 0, 3, 0, 5], + 'D': [np.nan, np.nan, np.nan, 1, 2]}) + sparse = df.to_sparse() + self.assertEqual(repr(sparse), repr(df)) + + with option_context("display.max_rows", 3): + self.assertEqual(repr(sparse), repr(df)) diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py index 67b108c5dc648..192f6532a148d 100644 --- a/pandas/sparse/tests/test_frame.py +++ b/pandas/sparse/tests/test_frame.py @@ -25,10 +25,9 @@ class TestSparseDataFrame(tm.TestCase, SharedWithSparse): _multiprocess_can_split_ = True def setUp(self): - self.data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C': np.arange(10), + 'C': np.arange(10, dtype=np.float64), 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} self.dates = bdate_range('1/1/2011', periods=10) @@ -125,10 +124,12 @@ def test_constructor(self): default_fill_value=self.frame.default_fill_value, default_kind=self.frame.default_kind, copy=True) reindexed = self.frame.reindex(idx) + 
tm.assert_sp_frame_equal(cons, reindexed, exact_indices=False) # assert level parameter breaks reindex - self.assertRaises(TypeError, self.frame.reindex, idx, level=0) + with tm.assertRaises(TypeError): + self.frame.reindex(idx, level=0) repr(self.frame) @@ -569,18 +570,23 @@ def test_apply(self): self.frame.to_dense().apply(nanops.nansum)) def test_apply_nonuq(self): - df_orig = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) - df = df_orig.to_sparse() - rs = df.apply(lambda s: s[0], axis=1) - xp = Series([1., 4., 7.], ['a', 'a', 'c']) - tm.assert_series_equal(rs, xp) + orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=['a', 'a', 'c']) + sparse = orig.to_sparse() + res = sparse.apply(lambda s: s[0], axis=1) + exp = orig.apply(lambda s: s[0], axis=1) + # dtype must be kept + self.assertEqual(res.dtype, np.int64) + # ToDo: apply must return subclassed dtype + self.assertIsInstance(res, pd.Series) + tm.assert_series_equal(res.to_dense(), exp) # df.T breaks - df = df_orig.T.to_sparse() - rs = df.apply(lambda s: s[0], axis=0) # noqa + sparse = orig.T.to_sparse() + res = sparse.apply(lambda s: s[0], axis=0) # noqa + exp = orig.T.apply(lambda s: s[0], axis=0) # TODO: no non-unique columns supported in sparse yet - # assert_series_equal(rs, xp) + # tm.assert_series_equal(res.to_dense(), exp) def test_applymap(self): # just test that it works @@ -596,8 +602,10 @@ def test_astype(self): self.assertEqual(sparse['B'].dtype, np.int64) res = sparse.astype(np.float64) - exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.]), - 'B': SparseArray([4., 5., 6., 7.])}, + exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.], + fill_value=0.), + 'B': SparseArray([4., 5., 6., 7.], + fill_value=0.)}, default_fill_value=np.nan) tm.assert_sp_frame_equal(res, exp) self.assertEqual(res['A'].dtype, np.float64) @@ -612,8 +620,10 @@ def test_astype(self): self.assertEqual(sparse['B'].dtype, np.int64) res = sparse.astype(np.float64) - exp = 
pd.SparseDataFrame({'A': SparseArray([0., 2., 0., 4.]), - 'B': SparseArray([0., 5., 0., 7.])}, + exp = pd.SparseDataFrame({'A': SparseArray([0., 2., 0., 4.], + fill_value=0.), + 'B': SparseArray([0., 5., 0., 7.], + fill_value=0.)}, default_fill_value=0.) tm.assert_sp_frame_equal(res, exp) self.assertEqual(res['A'].dtype, np.float64) @@ -813,6 +823,10 @@ def _check(frame, orig): untransposed = transposed.T tm.assert_sp_frame_equal(frame, untransposed) + tm.assert_frame_equal(frame.T.to_dense(), orig.T) + tm.assert_frame_equal(frame.T.T.to_dense(), orig.T.T) + tm.assert_sp_frame_equal(frame, frame.T.T, exact_indices=False) + self._check_all(_check) def test_shift(self): @@ -821,8 +835,8 @@ def _check(frame, orig): shifted = frame.shift(0) exp = orig.shift(0) - # int is coerced to float dtype - tm.assert_frame_equal(shifted.to_dense(), exp, check_dtype=False) + tm.assert_frame_equal(shifted.to_dense(), exp) + shifted = frame.shift(1) exp = orig.shift(1) tm.assert_frame_equal(shifted, exp) @@ -932,12 +946,85 @@ def test_nan_columnname(self): nan_colname_sparse = nan_colname.to_sparse() self.assertTrue(np.isnan(nan_colname_sparse.columns[0])) + def test_isnull(self): + # GH 8276 + df = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan], + 'B': [0, np.nan, np.nan, 2, np.nan]}) + + res = df.isnull() + exp = pd.SparseDataFrame({'A': [True, True, False, False, True], + 'B': [False, True, True, False, True]}, + default_fill_value=True) + exp._default_fill_value = np.nan + tm.assert_sp_frame_equal(res, exp) + + # if fill_value is not nan, True can be included in sp_values + df = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan], + 'B': [0, np.nan, 0, 2, np.nan]}, + default_fill_value=0.) 
+ res = df.isnull() + tm.assertIsInstance(res, pd.SparseDataFrame) + exp = pd.DataFrame({'A': [False, False, False, False, True], + 'B': [False, True, False, False, True]}) + tm.assert_frame_equal(res.to_dense(), exp) + + def test_isnotnull(self): + # GH 8276 + df = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan], + 'B': [0, np.nan, np.nan, 2, np.nan]}) + + res = df.isnotnull() + exp = pd.SparseDataFrame({'A': [False, False, True, True, False], + 'B': [True, False, False, True, False]}, + default_fill_value=False) + exp._default_fill_value = np.nan + tm.assert_sp_frame_equal(res, exp) + + # if fill_value is not nan, True can be included in sp_values + df = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan], + 'B': [0, np.nan, 0, 2, np.nan]}, + default_fill_value=0.) + res = df.isnotnull() + tm.assertIsInstance(res, pd.SparseDataFrame) + exp = pd.DataFrame({'A': [True, True, True, True, False], + 'B': [True, False, True, True, False]}) + tm.assert_frame_equal(res.to_dense(), exp) + + +class TestSparseDataFrameArithmetic(tm.TestCase): + + def test_numeric_op_scalar(self): + df = pd.DataFrame({'A': [nan, nan, 0, 1, ], + 'B': [0, 1, 2, nan], + 'C': [1., 2., 3., 4.], + 'D': [nan, nan, nan, nan]}) + sparse = df.to_sparse() + + tm.assert_sp_frame_equal(sparse + 1, (df + 1).to_sparse()) + + def test_comparison_op_scalar(self): + # GH 13001 + df = pd.DataFrame({'A': [nan, nan, 0, 1, ], + 'B': [0, 1, 2, nan], + 'C': [1., 2., 3., 4.], + 'D': [nan, nan, nan, nan]}) + sparse = df.to_sparse() + + # comparison changes internal repr, compare with dense + res = sparse > 1 + tm.assertIsInstance(res, pd.SparseDataFrame) + tm.assert_frame_equal(res.to_dense(), df > 1) + + res = sparse != 0 + tm.assertIsInstance(res, pd.SparseDataFrame) + tm.assert_frame_equal(res.to_dense(), df != 0) + class TestSparseDataFrameAnalytics(tm.TestCase): def setUp(self): self.data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C': np.arange(10), + 'C': 
np.arange(10, dtype=float), 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} self.dates = bdate_range('1/1/2011', periods=10) diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py index d176d95bb7dbf..c0d4b70c41dc4 100644 --- a/pandas/sparse/tests/test_indexing.py +++ b/pandas/sparse/tests/test_indexing.py @@ -49,6 +49,21 @@ def test_getitem_slice(self): tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse()) tm.assert_sp_series_equal(sparse[-5:], orig[-5:].to_sparse()) + def test_getitem_int_dtype(self): + # GH 8292 + s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], name='xxx') + res = s[::2] + exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6], name='xxx') + tm.assert_sp_series_equal(res, exp) + self.assertEqual(res.dtype, np.int64) + + s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], fill_value=0, name='xxx') + res = s[::2] + exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6], + fill_value=0, name='xxx') + tm.assert_sp_series_equal(res, exp) + self.assertEqual(res.dtype, np.int64) + def test_getitem_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0]) sparse = orig.to_sparse(fill_value=0) diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py index 4417411403baa..c289b4a1b204f 100644 --- a/pandas/sparse/tests/test_libsparse.py +++ b/pandas/sparse/tests/test_libsparse.py @@ -243,6 +243,61 @@ class TestSparseIndexCommon(tm.TestCase): _multiprocess_can_split_ = True + def test_int_internal(self): + idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='integer') + self.assertIsInstance(idx, IntIndex) + self.assertEqual(idx.npoints, 2) + tm.assert_numpy_array_equal(idx.indices, + np.array([2, 3], dtype=np.int32)) + + idx = _make_index(4, np.array([], dtype=np.int32), kind='integer') + self.assertIsInstance(idx, IntIndex) + self.assertEqual(idx.npoints, 0) + tm.assert_numpy_array_equal(idx.indices, + np.array([], dtype=np.int32)) + + idx = _make_index(4, np.array([0, 1, 2, 3], 
dtype=np.int32), + kind='integer') + self.assertIsInstance(idx, IntIndex) + self.assertEqual(idx.npoints, 4) + tm.assert_numpy_array_equal(idx.indices, + np.array([0, 1, 2, 3], dtype=np.int32)) + + def test_block_internal(self): + idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='block') + self.assertIsInstance(idx, BlockIndex) + self.assertEqual(idx.npoints, 2) + tm.assert_numpy_array_equal(idx.blocs, + np.array([2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([2], dtype=np.int32)) + + idx = _make_index(4, np.array([], dtype=np.int32), kind='block') + self.assertIsInstance(idx, BlockIndex) + self.assertEqual(idx.npoints, 0) + tm.assert_numpy_array_equal(idx.blocs, + np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([], dtype=np.int32)) + + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), + kind='block') + self.assertIsInstance(idx, BlockIndex) + self.assertEqual(idx.npoints, 4) + tm.assert_numpy_array_equal(idx.blocs, + np.array([0], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([4], dtype=np.int32)) + + idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), + kind='block') + self.assertIsInstance(idx, BlockIndex) + self.assertEqual(idx.npoints, 3) + tm.assert_numpy_array_equal(idx.blocs, + np.array([0, 2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([1, 2], dtype=np.int32)) + def test_lookup(self): for kind in ['integer', 'block']: idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind) diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index 95361a8899c46..9d5a1327da53f 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -98,10 +98,14 @@ def test_constructor_dtype(self): self.assertEqual(arr.dtype, np.float64) self.assertEqual(arr.fill_value, 0) - arr = SparseSeries([0, 1, 2, 4], dtype=np.int64) + arr = SparseSeries([0, 1, 2, 4], 
dtype=np.int64, fill_value=np.nan) self.assertEqual(arr.dtype, np.int64) self.assertTrue(np.isnan(arr.fill_value)) + arr = SparseSeries([0, 1, 2, 4], dtype=np.int64) + self.assertEqual(arr.dtype, np.int64) + self.assertEqual(arr.fill_value, 0) + arr = SparseSeries([0, 1, 2, 4], fill_value=0, dtype=np.int64) self.assertEqual(arr.dtype, np.int64) self.assertEqual(arr.fill_value, 0) @@ -354,7 +358,19 @@ def test_shape(self): self.assertEqual(self.ziseries2.shape, (15, )) def test_astype(self): - self.assertRaises(Exception, self.bseries.astype, np.int64) + with tm.assertRaises(ValueError): + self.bseries.astype(np.int64) + + def test_astype_all(self): + orig = pd.Series(np.array([1, 2, 3])) + s = SparseSeries(orig) + + types = [np.float64, np.float32, np.int64, + np.int32, np.int16, np.int8] + for typ in types: + res = s.astype(typ) + self.assertEqual(res.dtype, typ) + tm.assert_series_equal(res.to_dense(), orig.astype(typ)) def test_kind(self): self.assertEqual(self.bseries.kind, 'block') @@ -766,7 +782,8 @@ def _check_matches(indices, expected): data = {} for i, idx in enumerate(indices): data[i] = SparseSeries(idx.to_int_index().indices, - sparse_index=idx) + sparse_index=idx, fill_value=np.nan) + # homogenized is only valid with NaN fill values homogenized = spf.homogenize(data) for k, v in compat.iteritems(homogenized): @@ -866,9 +883,14 @@ def test_shift_nan(self): def test_shift_dtype(self): # GH 12908 orig = pd.Series([1, 2, 3, 4], dtype=np.int64) - sparse = orig.to_sparse() + sparse = orig.to_sparse() tm.assert_sp_series_equal(sparse.shift(0), orig.shift(0).to_sparse()) + + sparse = orig.to_sparse(fill_value=np.nan) + tm.assert_sp_series_equal(sparse.shift(0), + orig.shift(0).to_sparse(fill_value=np.nan)) + # shift(1) or more span changes dtype to float64 tm.assert_sp_series_equal(sparse.shift(1), orig.shift(1).to_sparse()) tm.assert_sp_series_equal(sparse.shift(2), orig.shift(2).to_sparse()) tm.assert_sp_series_equal(sparse.shift(3), 
orig.shift(3).to_sparse()) @@ -881,25 +903,27 @@ def test_shift_dtype(self): def test_shift_dtype_fill_value(self): # GH 12908 orig = pd.Series([1, 0, 0, 4], dtype=np.int64) - sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse.shift(0), - orig.shift(0).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(1), - orig.shift(1).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(2), - orig.shift(2).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(3), - orig.shift(3).to_sparse(fill_value=0)) - - tm.assert_sp_series_equal(sparse.shift(-1), - orig.shift(-1).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(-2), - orig.shift(-2).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(-3), - orig.shift(-3).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(-4), - orig.shift(-4).to_sparse(fill_value=0)) + for v in [0, 1, np.nan]: + sparse = orig.to_sparse(fill_value=v) + + tm.assert_sp_series_equal(sparse.shift(0), + orig.shift(0).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(1), + orig.shift(1).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(2), + orig.shift(2).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(3), + orig.shift(3).to_sparse(fill_value=v)) + + tm.assert_sp_series_equal(sparse.shift(-1), + orig.shift(-1).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(-2), + orig.shift(-2).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(-3), + orig.shift(-3).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(-4), + orig.shift(-4).to_sparse(fill_value=v)) def test_combine_first(self): s = self.bseries @@ -1247,6 +1271,40 @@ def test_value_counts_int(self): tm.assert_series_equal(sparse.value_counts(dropna=False), dense.value_counts(dropna=False)) + def test_isnull(self): + # GH 8276 + s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name='xxx') + + res = s.isnull() + 
exp = pd.SparseSeries([True, True, False, False, True], name='xxx', + fill_value=True) + tm.assert_sp_series_equal(res, exp) + + # if fill_value is not nan, True can be included in sp_values + s = pd.SparseSeries([np.nan, 0., 1., 2., 0.], name='xxx', + fill_value=0.) + res = s.isnull() + tm.assertIsInstance(res, pd.SparseSeries) + exp = pd.Series([True, False, False, False, False], name='xxx') + tm.assert_series_equal(res.to_dense(), exp) + + def test_isnotnull(self): + # GH 8276 + s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name='xxx') + + res = s.isnotnull() + exp = pd.SparseSeries([False, False, True, True, False], name='xxx', + fill_value=False) + tm.assert_sp_series_equal(res, exp) + + # if fill_value is not nan, True can be included in sp_values + s = pd.SparseSeries([np.nan, 0., 1., 2., 0.], name='xxx', + fill_value=0.) + res = s.isnotnull() + tm.assertIsInstance(res, pd.SparseSeries) + exp = pd.Series([False, True, True, True, True], name='xxx') + tm.assert_series_equal(res.to_dense(), exp) + def _dense_series_compare(s, f): result = f(s) diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index 440e433ffd95c..cc07c7d9dd59b 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -40,16 +40,33 @@ class TestSparseSeriesSubclassing(tm.TestCase): _multiprocess_can_split_ = True def test_subclass_sparse_slice(self): + # int64 s = tm.SubclassedSparseSeries([1, 2, 3, 4, 5]) - tm.assert_sp_series_equal(s.loc[1:3], - tm.SubclassedSparseSeries([2.0, 3.0, 4.0], - index=[1, 2, 3])) - tm.assert_sp_series_equal(s.iloc[1:3], - tm.SubclassedSparseSeries([2.0, 3.0], - index=[1, 2])) - tm.assert_sp_series_equal(s[1:3], - tm.SubclassedSparseSeries([2.0, 3.0], - index=[1, 2])) + exp = tm.SubclassedSparseSeries([2, 3, 4], index=[1, 2, 3]) + tm.assert_sp_series_equal(s.loc[1:3], exp) + self.assertEqual(s.loc[1:3].dtype, np.int64) + + exp = tm.SubclassedSparseSeries([2, 3], index=[1, 2]) + 
tm.assert_sp_series_equal(s.iloc[1:3], exp) + self.assertEqual(s.iloc[1:3].dtype, np.int64) + + exp = tm.SubclassedSparseSeries([2, 3], index=[1, 2]) + tm.assert_sp_series_equal(s[1:3], exp) + self.assertEqual(s[1:3].dtype, np.int64) + + # float64 + s = tm.SubclassedSparseSeries([1., 2., 3., 4., 5.]) + exp = tm.SubclassedSparseSeries([2., 3., 4.], index=[1, 2, 3]) + tm.assert_sp_series_equal(s.loc[1:3], exp) + self.assertEqual(s.loc[1:3].dtype, np.float64) + + exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2]) + tm.assert_sp_series_equal(s.iloc[1:3], exp) + self.assertEqual(s.iloc[1:3].dtype, np.float64) + + exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2]) + tm.assert_sp_series_equal(s[1:3], exp) + self.assertEqual(s[1:3].dtype, np.float64) def test_subclass_sparse_addition(self): s1 = tm.SubclassedSparseSeries([1, 3, 5]) @@ -66,9 +83,17 @@ def test_subclass_sparse_to_frame(self): s = tm.SubclassedSparseSeries([1, 2], index=list('abcd'), name='xxx') res = s.to_frame() - exp_arr = pd.SparseArray([1, 2], dtype=np.int64, kind='block') + exp_arr = pd.SparseArray([1, 2], dtype=np.int64, kind='block', + fill_value=0) exp = tm.SubclassedSparseDataFrame({'xxx': exp_arr}, - index=list('abcd')) + index=list('abcd'), + default_fill_value=0) + tm.assert_sp_frame_equal(res, exp) + + # create from int dict + res = tm.SubclassedSparseDataFrame({'xxx': [1, 2]}, + index=list('abcd'), + default_fill_value=0) tm.assert_sp_frame_equal(res, exp) s = tm.SubclassedSparseSeries([1.1, 2.1], index=list('abcd'), diff --git a/pandas/util/testing.py b/pandas/util/testing.py index d39569ea0b826..d50a6c460ceb5 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1404,9 +1404,8 @@ def assert_sp_array_equal(left, right): assert_numpy_array_equal(left.values, right.values) -def assert_sp_series_equal(left, right, exact_indices=True, - check_series_type=True, - check_names=True, +def assert_sp_series_equal(left, right, check_dtype=True, exact_indices=True, + 
check_series_type=True, check_names=True, obj='SparseSeries'): """Check that the left and right SparseSeries are equal. @@ -1414,6 +1413,8 @@ def assert_sp_series_equal(left, right, exact_indices=True, ---------- left : SparseSeries right : SparseSeries + check_dtype : bool, default True + Whether to check the Series dtype is identical. exact_indices : bool, default True check_series_type : bool, default True Whether to check the SparseSeries class is identical. @@ -1436,20 +1437,22 @@ def assert_sp_series_equal(left, right, exact_indices=True, if check_names: assert_attr_equal('name', left, right) - assert_attr_equal('dtype', left, right) + if check_dtype: + assert_attr_equal('dtype', left, right) assert_numpy_array_equal(left.values, right.values) -def assert_sp_frame_equal(left, right, exact_indices=True, - check_frame_type=True, - obj='SparseDataFrame'): +def assert_sp_frame_equal(left, right, check_dtype=True, exact_indices=True, + check_frame_type=True, obj='SparseDataFrame'): """Check that the left and right SparseDataFrame are equal. Parameters ---------- left : SparseDataFrame right : SparseDataFrame + check_dtype : bool, default True + Whether to check the Series dtype is identical. exact_indices : bool, default True SparseSeries SparseIndex objects must be exactly the same, otherwise just compare dense representations. @@ -1475,9 +1478,11 @@ def assert_sp_frame_equal(left, right, exact_indices=True, # trade-off? if exact_indices: - assert_sp_series_equal(series, right[col]) + assert_sp_series_equal(series, right[col], + check_dtype=check_dtype) else: - assert_series_equal(series.to_dense(), right[col].to_dense()) + assert_series_equal(series.to_dense(), right[col].to_dense(), + check_dtype=check_dtype) assert_attr_equal('default_fill_value', left, right, obj=obj)