pandas-dev · sinhrks · Jul 31, 2016
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -323,7 +323,24 @@ These changes allow pandas to handle sparse data with more dtypes, and for work
 
    s + 1
 
+- Sparse data structures now support ``astype`` to convert the internal ``dtype`` (:issue:`13900`)
 
+.. ipython:: python
+
+   s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0)
+   s
+   s.astype(np.int64)
+
+``astype`` fails if the data contains values which cannot be converted to the specified ``dtype``.
+Note that this limitation also applies to ``fill_value``, whose default is ``np.nan``.
+
+.. code-block:: ipython
+
+   In [7]: pd.SparseSeries([1., np.nan, 2., np.nan], fill_value=np.nan).astype(np.int64)
+   Out[7]:
+   ValueError: unable to coerce current fill_value nan to int64 dtype
+
+- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`)
 - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
 - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`)
 - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`)
@@ -411,7 +428,7 @@ API changes
 - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`)
 - ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`)
 - ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`)
-- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`)
+
 
 
 

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -2504,6 +2504,14 @@ def sp_index(self):
     def kind(self):
         return self.values.kind
 
+    def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
+                klass=None, mgr=None, **kwargs):
+        if values is None:
+            values = self.values
+        values = values.astype(dtype, copy=copy)
+        return self.make_block_same_class(values=values,
+                                          placement=self.mgr_locs)
+
     def __len__(self):
         try:
             return self.sp_index.length
@@ -2521,7 +2529,7 @@ def make_block_same_class(self, values, placement, sparse_index=None,
                               copy=False, fastpath=True, **kwargs):
         """ return a new block """
         if dtype is None:
-            dtype = self.dtype
+            dtype = values.dtype
         if fill_value is None and not isinstance(values, SparseArray):
             fill_value = self.values.fill_value
 

diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py
@@ -18,8 +18,9 @@
 from pandas.types.common import (is_float, is_integer,
                                  is_integer_dtype, _ensure_platform_int,
                                  is_list_like,
-                                 is_scalar)
-from pandas.types.cast import _possibly_convert_platform
+                                 is_scalar, is_dtype_equal)
+from pandas.types.cast import (_possibly_convert_platform, _maybe_promote,
+                               _astype_nansafe)
 from pandas.types.missing import isnull, notnull
 
 from pandas._sparse import SparseIndex, BlockIndex, IntIndex
@@ -236,7 +237,7 @@ def _simple_new(cls, data, sp_index, fill_value):
             raise ValueError('sp_index must be a SparseIndex')
 
         result.sp_index = sp_index
-        result.fill_value = fill_value
+        result._fill_value = fill_value
         return result
 
     @property
@@ -285,7 +286,7 @@ def __array_finalize__(self, obj):
         to pass on the index.
         """
         self.sp_index = getattr(obj, 'sp_index', None)
-        self.fill_value = getattr(obj, 'fill_value', None)
+        self._fill_value = getattr(obj, 'fill_value', None)
 
     def __reduce__(self):
         """Necessary for making this object picklable"""
@@ -301,7 +302,7 @@ def __setstate__(self, state):
 
         fill_value, sp_index = own_state[:2]
         self.sp_index = sp_index
-        self.fill_value = fill_value
+        self._fill_value = fill_value
 
     def __len__(self):
         try:
@@ -344,6 +345,22 @@ def sp_values(self):
         # caching not an option, leaks memory
         return self.view(np.ndarray)
 
+    @property
+    def fill_value(self):
+        return self._fill_value
+
+    @fill_value.setter
+    def fill_value(self, value):
+        if not is_scalar(value):
+            raise ValueError('fill_value must be a scalar')
+        # if the specified value triggers type promotion, raise ValueError
+        new_dtype, fill_value = _maybe_promote(self.dtype, value)
+        if is_dtype_equal(self.dtype, new_dtype):
+            self._fill_value = fill_value
+        else:
+            msg = 'unable to set fill_value {0} to {1} dtype'
+            raise ValueError(msg.format(value, self.dtype))
+
     def get_values(self, fill=None):
         """ return a dense representation """
         return self.to_dense(fill=fill)
@@ -479,19 +496,16 @@ def __setslice__(self, i, j, value):
         raise TypeError("SparseArray does not support item assignment via "
                         "slices")
 
-    def astype(self, dtype=None):
-        """
-
-        """
+    def astype(self, dtype=None, copy=True):
         dtype = np.dtype(dtype)
-        if dtype is not None and dtype not in (np.float_, float):
-            raise TypeError('Can only support floating point data for now')
-
-        if self.dtype == dtype:
-            return self.copy()
-        else:
-            return self._simple_new(self.sp_values.astype(dtype),
-                                    self.sp_index, float(self.fill_value))
+        sp_values = _astype_nansafe(self.sp_values, dtype, copy=copy)
+        try:
+            fill_value = dtype.type(self.fill_value)
+        except ValueError:
+            msg = 'unable to coerce current fill_value {0} to {1} dtype'
+            raise ValueError(msg.format(self.fill_value, dtype))
+        return self._simple_new(sp_values, self.sp_index,
+                                fill_value=fill_value)
 
     def copy(self, deep=True):
         """

diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py
@@ -235,8 +235,19 @@ def to_dense(self):
         data = dict((k, v.to_dense()) for k, v in compat.iteritems(self))
         return DataFrame(data, index=self.index, columns=self.columns)
 
+    def _apply_columns(self, func):
+        """ get new SparseDataFrame applying func to each columns """
+
+        new_data = {}
+        for col, series in compat.iteritems(self):
+            new_data[col] = func(series)
+
+        return self._constructor(
+            data=new_data, index=self.index, columns=self.columns,
+            default_fill_value=self.default_fill_value).__finalize__(self)
+
     def astype(self, dtype):
-        raise NotImplementedError
+        return self._apply_columns(lambda x: x.astype(dtype))
 
     def copy(self, deep=True):
         """
@@ -499,13 +510,7 @@ def _combine_match_columns(self, other, func, level=None, fill_value=None):
             default_fill_value=self.default_fill_value).__finalize__(self)
 
     def _combine_const(self, other, func):
-        new_data = {}
-        for col, series in compat.iteritems(self):
-            new_data[col] = func(series, other)
-
-        return self._constructor(
-            data=new_data, index=self.index, columns=self.columns,
-            default_fill_value=self.default_fill_value).__finalize__(self)
+        return self._apply_columns(lambda x: func(x, other))
 
     def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
                        limit=None, takeable=False):

diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py
@@ -324,7 +324,68 @@ def test_astype(self):
         res.sp_values[:3] = 27
         self.assertFalse((self.arr.sp_values[:3] == 27).any())
 
-        assertRaisesRegexp(TypeError, "floating point", self.arr.astype, 'i8')
+        msg = "unable to coerce current fill_value nan to int64 dtype"
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.arr.astype('i8')
+
+        arr = SparseArray([0, np.nan, 0, 1])
+        with tm.assertRaisesRegexp(ValueError, msg):
+            arr.astype('i8')
+
+        arr = SparseArray([0, np.nan, 0, 1], fill_value=0)
+        msg = "Cannot convert NA to integer"
+        with tm.assertRaisesRegexp(ValueError, msg):
+            arr.astype('i8')
+
+    def test_astype_all(self):
+        vals = np.array([1, 2, 3])
+        arr = SparseArray(vals, fill_value=1)
+
+        types = [np.float64, np.float32, np.int64,
+                 np.int32, np.int16, np.int8]
+        for typ in types:
+            res = arr.astype(typ)
+            self.assertEqual(res.dtype, typ)
+            self.assertEqual(res.sp_values.dtype, typ)
+
+            tm.assert_numpy_array_equal(res.values, vals.astype(typ))
+
+    def test_set_fill_value(self):
+        arr = SparseArray([1., np.nan, 2.], fill_value=np.nan)
+        arr.fill_value = 2
+        self.assertEqual(arr.fill_value, 2)
+
+        arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64)
+        arr.fill_value = 2
+        self.assertEqual(arr.fill_value, 2)
+
+        # coerces to int
+        msg = "unable to set fill_value 3\\.1 to int64 dtype"
+        with tm.assertRaisesRegexp(ValueError, msg):
+            arr.fill_value = 3.1
+
+        msg = "unable to set fill_value nan to int64 dtype"
+        with tm.assertRaisesRegexp(ValueError, msg):
+            arr.fill_value = np.nan
+
+        arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool)
+        arr.fill_value = True
+        self.assertTrue(arr.fill_value)
+
+        # coerces to bool
+        msg = "unable to set fill_value 0 to bool dtype"
+        with tm.assertRaisesRegexp(ValueError, msg):
+            arr.fill_value = 0
+
+        msg = "unable to set fill_value nan to bool dtype"
+        with tm.assertRaisesRegexp(ValueError, msg):
+            arr.fill_value = np.nan
+
+        # invalid
+        msg = "fill_value must be a scalar"
+        for val in [[1, 2, 3], np.array([1, 2]), (1, 2, 3)]:
+            with tm.assertRaisesRegexp(ValueError, msg):
+                arr.fill_value = val
 
     def test_copy_shallow(self):
         arr2 = self.arr.copy(deep=False)

diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py
@@ -15,7 +15,7 @@
 import pandas.sparse.frame as spf
 
 from pandas._sparse import BlockIndex, IntIndex
-from pandas.sparse.api import SparseSeries, SparseDataFrame
+from pandas.sparse.api import SparseSeries, SparseDataFrame, SparseArray
 from pandas.tests.frame.test_misc_api import SharedWithSparse
 
 
@@ -588,7 +588,59 @@ def test_applymap(self):
         tm.assertIsInstance(result, SparseDataFrame)
 
     def test_astype(self):
-        self.assertRaises(Exception, self.frame.astype, np.int64)
+        sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4],
+                                                      dtype=np.int64),
+                                     'B': SparseArray([4, 5, 6, 7],
+                                                      dtype=np.int64)})
+        self.assertEqual(sparse['A'].dtype, np.int64)
+        self.assertEqual(sparse['B'].dtype, np.int64)
+
+        res = sparse.astype(np.float64)
+        exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.]),
+                                  'B': SparseArray([4., 5., 6., 7.])},
+                                 default_fill_value=np.nan)
+        tm.assert_sp_frame_equal(res, exp)
+        self.assertEqual(res['A'].dtype, np.float64)
+        self.assertEqual(res['B'].dtype, np.float64)
+
+        sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4],
+                                                      dtype=np.int64),
+                                     'B': SparseArray([0, 5, 0, 7],
+                                                      dtype=np.int64)},
+                                    default_fill_value=0)
+        self.assertEqual(sparse['A'].dtype, np.int64)
+        self.assertEqual(sparse['B'].dtype, np.int64)
+
+        res = sparse.astype(np.float64)
+        exp = pd.SparseDataFrame({'A': SparseArray([0., 2., 0., 4.]),
+                                  'B': SparseArray([0., 5., 0., 7.])},
+                                 default_fill_value=0.)
+        tm.assert_sp_frame_equal(res, exp)
+        self.assertEqual(res['A'].dtype, np.float64)
+        self.assertEqual(res['B'].dtype, np.float64)
+
+    def test_astype_bool(self):
+        sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4],
+                                                      fill_value=0,
+                                                      dtype=np.int64),
+                                     'B': SparseArray([0, 5, 0, 7],
+                                                      fill_value=0,
+                                                      dtype=np.int64)},
+                                    default_fill_value=0)
+        self.assertEqual(sparse['A'].dtype, np.int64)
+        self.assertEqual(sparse['B'].dtype, np.int64)
+
+        res = sparse.astype(bool)
+        exp = pd.SparseDataFrame({'A': SparseArray([False, True, False, True],
+                                                   dtype=np.bool,
+                                                   fill_value=False),
+                                  'B': SparseArray([False, True, False, True],
+                                                   dtype=np.bool,
+                                                   fill_value=False)},
+                                 default_fill_value=False)
+        tm.assert_sp_frame_equal(res, exp)
+        self.assertEqual(res['A'].dtype, np.bool)
+        self.assertEqual(res['B'].dtype, np.bool)
 
     def test_fillna(self):
         df = self.zframe.reindex(lrange(5))

diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py
@@ -797,7 +797,8 @@ def test_fill_value_corner(self):
         cop2 = self.zbseries.copy()
         cop2.fill_value = 1
         result = cop2 / cop
-        self.assertEqual(result.fill_value, np.inf)
+        # 1 / 0 is inf
+        self.assertTrue(np.isinf(result.fill_value))
 
     def test_fill_value_when_combine_const(self):
         # GH12723