Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: sparse astype now supports int64 and bool #13900

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,24 @@ These changes allow pandas to handle sparse data with more dtypes, and for work

s + 1

- Sparse data structures now support ``astype`` to convert the internal ``dtype`` (:issue:`13900`)

.. ipython:: python

s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0)
s
s.astype(np.int64)

``astype`` fails if the data contains values which cannot be converted to the specified ``dtype``.
Note that this limitation also applies to ``fill_value``, whose default is ``np.nan``.

.. code-block:: ipython

In [7]: pd.SparseSeries([1., np.nan, 2., np.nan], fill_value=np.nan).astype(np.int64)
Out[7]:
ValueError: unable to coerce current fill_value nan to int64 dtype

- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`)
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`)
- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`)
Expand Down Expand Up @@ -411,7 +428,7 @@ API changes
- ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`)
- ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`)
- ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`)
- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`)




Expand Down
10 changes: 9 additions & 1 deletion pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -2504,6 +2504,14 @@ def sp_index(self):
def kind(self):
return self.values.kind

def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
            klass=None, mgr=None, **kwargs):
    """
    Cast the block's values to ``dtype`` and wrap them in a new block.

    Parameters
    ----------
    dtype : target dtype, forwarded to ``values.astype``
    copy : bool, default False
        Forwarded to ``values.astype``.
    values : optional
        Values to convert; when None, ``self.values`` is used.
    raise_on_error, klass, mgr, **kwargs
        Accepted for signature compatibility; not used here.

    Returns
    -------
    The result of ``self.make_block_same_class`` for the converted
    values, placed at ``self.mgr_locs``.
    """
    # only fall back to the block's own values when none were supplied
    source = self.values if values is None else values
    converted = source.astype(dtype, copy=copy)
    return self.make_block_same_class(values=converted,
                                      placement=self.mgr_locs)

def __len__(self):
try:
return self.sp_index.length
Expand All @@ -2521,7 +2529,7 @@ def make_block_same_class(self, values, placement, sparse_index=None,
copy=False, fastpath=True, **kwargs):
""" return a new block """
if dtype is None:
dtype = self.dtype
dtype = values.dtype
if fill_value is None and not isinstance(values, SparseArray):
fill_value = self.values.fill_value

Expand Down
48 changes: 31 additions & 17 deletions pandas/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
from pandas.types.common import (is_float, is_integer,
is_integer_dtype, _ensure_platform_int,
is_list_like,
is_scalar)
from pandas.types.cast import _possibly_convert_platform
is_scalar, is_dtype_equal)
from pandas.types.cast import (_possibly_convert_platform, _maybe_promote,
_astype_nansafe)
from pandas.types.missing import isnull, notnull

from pandas._sparse import SparseIndex, BlockIndex, IntIndex
Expand Down Expand Up @@ -236,7 +237,7 @@ def _simple_new(cls, data, sp_index, fill_value):
raise ValueError('sp_index must be a SparseIndex')

result.sp_index = sp_index
result.fill_value = fill_value
result._fill_value = fill_value
return result

@property
Expand Down Expand Up @@ -285,7 +286,7 @@ def __array_finalize__(self, obj):
to pass on the index.
"""
self.sp_index = getattr(obj, 'sp_index', None)
self.fill_value = getattr(obj, 'fill_value', None)
self._fill_value = getattr(obj, 'fill_value', None)

def __reduce__(self):
"""Necessary for making this object picklable"""
Expand All @@ -301,7 +302,7 @@ def __setstate__(self, state):

fill_value, sp_index = own_state[:2]
self.sp_index = sp_index
self.fill_value = fill_value
self._fill_value = fill_value

def __len__(self):
try:
Expand Down Expand Up @@ -344,6 +345,22 @@ def sp_values(self):
# caching not an option, leaks memory
return self.view(np.ndarray)

@property
def fill_value(self):
    """ return the stored fill value """
    return self._fill_value

@fill_value.setter
def fill_value(self, value):
    """
    Set the fill value, rejecting anything that does not fit the dtype.

    Raises
    ------
    ValueError
        If ``value`` is not a scalar, or if ``_maybe_promote`` reports a
        dtype different from ``self.dtype`` for it.
    """
    if not is_scalar(value):
        raise ValueError('fill_value must be a scalar')

    # a changed dtype means the value would trigger type promotion,
    # which is not allowed when setting fill_value in place
    promoted_dtype, coerced = _maybe_promote(self.dtype, value)
    if not is_dtype_equal(self.dtype, promoted_dtype):
        template = 'unable to set fill_value {0} to {1} dtype'
        raise ValueError(template.format(value, self.dtype))

    self._fill_value = coerced

def get_values(self, fill=None):
""" return a dense representation """
return self.to_dense(fill=fill)
Expand Down Expand Up @@ -479,19 +496,16 @@ def __setslice__(self, i, j, value):
raise TypeError("SparseArray does not support item assignment via "
"slices")

def astype(self, dtype=None):
"""

"""
def astype(self, dtype=None, copy=True):
dtype = np.dtype(dtype)
if dtype is not None and dtype not in (np.float_, float):
raise TypeError('Can only support floating point data for now')

if self.dtype == dtype:
return self.copy()
else:
return self._simple_new(self.sp_values.astype(dtype),
self.sp_index, float(self.fill_value))
sp_values = _astype_nansafe(self.sp_values, dtype, copy=copy)
try:
fill_value = dtype.type(self.fill_value)
except ValueError:
msg = 'unable to coerce current fill_value {0} to {1} dtype'
raise ValueError(msg.format(self.fill_value, dtype))
return self._simple_new(sp_values, self.sp_index,
fill_value=fill_value)

def copy(self, deep=True):
"""
Expand Down
21 changes: 13 additions & 8 deletions pandas/sparse/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,8 +235,19 @@ def to_dense(self):
data = dict((k, v.to_dense()) for k, v in compat.iteritems(self))
return DataFrame(data, index=self.index, columns=self.columns)

def _apply_columns(self, func):
    """ build a new frame by applying func to each column """

    # func receives each column's series; results are keyed by column
    transformed = {label: func(column)
                   for label, column in compat.iteritems(self)}

    result = self._constructor(
        data=transformed, index=self.index, columns=self.columns,
        default_fill_value=self.default_fill_value)
    return result.__finalize__(self)

def astype(self, dtype):
raise NotImplementedError
return self._apply_columns(lambda x: x.astype(dtype))

def copy(self, deep=True):
"""
Expand Down Expand Up @@ -499,13 +510,7 @@ def _combine_match_columns(self, other, func, level=None, fill_value=None):
default_fill_value=self.default_fill_value).__finalize__(self)

def _combine_const(self, other, func):
new_data = {}
for col, series in compat.iteritems(self):
new_data[col] = func(series, other)

return self._constructor(
data=new_data, index=self.index, columns=self.columns,
default_fill_value=self.default_fill_value).__finalize__(self)
return self._apply_columns(lambda x: func(x, other))

def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
limit=None, takeable=False):
Expand Down
63 changes: 62 additions & 1 deletion pandas/sparse/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,68 @@ def test_astype(self):
res.sp_values[:3] = 27
self.assertFalse((self.arr.sp_values[:3] == 27).any())

assertRaisesRegexp(TypeError, "floating point", self.arr.astype, 'i8')
msg = "unable to coerce current fill_value nan to int64 dtype"
with tm.assertRaisesRegexp(ValueError, msg):
self.arr.astype('i8')

arr = SparseArray([0, np.nan, 0, 1])
with tm.assertRaisesRegexp(ValueError, msg):
arr.astype('i8')

arr = SparseArray([0, np.nan, 0, 1], fill_value=0)
msg = "Cannot convert NA to integer"
with tm.assertRaisesRegexp(ValueError, msg):
arr.astype('i8')

def test_astype_all(self):
    """ astype to each float/int dtype converts dtype, sp_values, values """
    dense = np.array([1, 2, 3])
    sparse = SparseArray(dense, fill_value=1)

    for target in (np.float64, np.float32, np.int64,
                   np.int32, np.int16, np.int8):
        converted = sparse.astype(target)

        # both the array and its stored sparse values carry the new dtype
        self.assertEqual(converted.dtype, target)
        self.assertEqual(converted.sp_values.dtype, target)

        tm.assert_numpy_array_equal(converted.values,
                                    dense.astype(target))

def test_set_fill_value(self):
    """
    Check that assigning ``fill_value`` accepts dtype-compatible scalars
    and raises ValueError for incompatible or non-scalar values.
    """
    arr = SparseArray([1., np.nan, 2.], fill_value=np.nan)
    arr.fill_value = 2
    self.assertEqual(arr.fill_value, 2)

    arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64)
    arr.fill_value = 2
    self.assertEqual(arr.fill_value, 2)

    # float values are rejected for an int64 array
    msg = "unable to set fill_value 3\\.1 to int64 dtype"
    with tm.assertRaisesRegexp(ValueError, msg):
        arr.fill_value = 3.1

    msg = "unable to set fill_value nan to int64 dtype"
    with tm.assertRaisesRegexp(ValueError, msg):
        arr.fill_value = np.nan

    # builtin ``bool`` is used instead of the ``np.bool`` alias, which
    # was deprecated in NumPy 1.20 and removed in NumPy 1.24
    arr = SparseArray([True, False, True], fill_value=False, dtype=bool)
    arr.fill_value = True
    self.assertTrue(arr.fill_value)

    # non-bool values are rejected for a bool array
    msg = "unable to set fill_value 0 to bool dtype"
    with tm.assertRaisesRegexp(ValueError, msg):
        arr.fill_value = 0

    msg = "unable to set fill_value nan to bool dtype"
    with tm.assertRaisesRegexp(ValueError, msg):
        arr.fill_value = np.nan

    # invalid: list-likes are not scalars
    msg = "fill_value must be a scalar"
    for val in [[1, 2, 3], np.array([1, 2]), (1, 2, 3)]:
        with tm.assertRaisesRegexp(ValueError, msg):
            arr.fill_value = val

def test_copy_shallow(self):
arr2 = self.arr.copy(deep=False)
Expand Down
56 changes: 54 additions & 2 deletions pandas/sparse/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import pandas.sparse.frame as spf

from pandas._sparse import BlockIndex, IntIndex
from pandas.sparse.api import SparseSeries, SparseDataFrame
from pandas.sparse.api import SparseSeries, SparseDataFrame, SparseArray
from pandas.tests.frame.test_misc_api import SharedWithSparse


Expand Down Expand Up @@ -588,7 +588,59 @@ def test_applymap(self):
tm.assertIsInstance(result, SparseDataFrame)

def test_astype(self):
self.assertRaises(Exception, self.frame.astype, np.int64)
sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4],
dtype=np.int64),
'B': SparseArray([4, 5, 6, 7],
dtype=np.int64)})
self.assertEqual(sparse['A'].dtype, np.int64)
self.assertEqual(sparse['B'].dtype, np.int64)

res = sparse.astype(np.float64)
exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.]),
'B': SparseArray([4., 5., 6., 7.])},
default_fill_value=np.nan)
tm.assert_sp_frame_equal(res, exp)
self.assertEqual(res['A'].dtype, np.float64)
self.assertEqual(res['B'].dtype, np.float64)

sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4],
dtype=np.int64),
'B': SparseArray([0, 5, 0, 7],
dtype=np.int64)},
default_fill_value=0)
self.assertEqual(sparse['A'].dtype, np.int64)
self.assertEqual(sparse['B'].dtype, np.int64)

res = sparse.astype(np.float64)
exp = pd.SparseDataFrame({'A': SparseArray([0., 2., 0., 4.]),
'B': SparseArray([0., 5., 0., 7.])},
default_fill_value=0.)
tm.assert_sp_frame_equal(res, exp)
self.assertEqual(res['A'].dtype, np.float64)
self.assertEqual(res['B'].dtype, np.float64)

def test_astype_bool(self):
    """
    Check that ``SparseDataFrame.astype(bool)`` converts int64 columns
    to bool columns with a ``False`` fill value.
    """
    sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4],
                                                  fill_value=0,
                                                  dtype=np.int64),
                                 'B': SparseArray([0, 5, 0, 7],
                                                  fill_value=0,
                                                  dtype=np.int64)},
                                default_fill_value=0)
    self.assertEqual(sparse['A'].dtype, np.int64)
    self.assertEqual(sparse['B'].dtype, np.int64)

    res = sparse.astype(bool)
    # builtin ``bool`` is used instead of the ``np.bool`` alias, which
    # was deprecated in NumPy 1.20 and removed in NumPy 1.24
    exp = pd.SparseDataFrame({'A': SparseArray([False, True, False, True],
                                               dtype=bool,
                                               fill_value=False),
                              'B': SparseArray([False, True, False, True],
                                               dtype=bool,
                                               fill_value=False)},
                             default_fill_value=False)
    tm.assert_sp_frame_equal(res, exp)
    self.assertEqual(res['A'].dtype, bool)
    self.assertEqual(res['B'].dtype, bool)

def test_fillna(self):
df = self.zframe.reindex(lrange(5))
Expand Down
3 changes: 2 additions & 1 deletion pandas/sparse/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -797,7 +797,8 @@ def test_fill_value_corner(self):
cop2 = self.zbseries.copy()
cop2.fill_value = 1
result = cop2 / cop
self.assertEqual(result.fill_value, np.inf)
# 1 / 0 is inf
self.assertTrue(np.isinf(result.fill_value))

def test_fill_value_when_combine_const(self):
# GH12723
Expand Down