BUG: SparseDataFrame indexing may return normal Series
closes #12787
sinhrks authored and jreback committed Apr 4, 2016
1 parent 610d3d5 commit a3a0942
Showing 5 changed files with 172 additions and 6 deletions.
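For context, a minimal reproduction sketch of the behavior this commit addresses, built on the same frame the new tests use (the pre-fix return type is inferred from the bug report; `to_sparse` is the 0.18-era API and has since been removed from pandas):

import numpy as np
import pandas as pd

orig = pd.DataFrame([[1, np.nan, np.nan],
                     [2, 3, np.nan],
                     [np.nan, np.nan, 4]],
                    columns=list('xyz'))
sparse = orig.to_sparse()

# Selecting a single row (or column) from a SparseDataFrame could come back
# as a dense Series before this fix; afterwards it stays sparse.
row = sparse.loc[0]
print(type(row).__name__)   # 'SparseSeries' after this commit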
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.1.txt
@@ -77,6 +77,7 @@ These changes conform sparse handling to return the correct types and work to ma
- Bug in ``SparseSeries.loc[]`` with list-like input raises ``TypeError`` (:issue:`10560`)
- Bug in ``SparseSeries.iloc[]`` with scalar input may raise ``IndexError`` (:issue:`10560`)
- Bug in ``SparseSeries.loc[]``, ``.iloc[]`` with ``slice`` returns ``SparseArray``, rather than ``SparseSeries`` (:issue:`10560`)
- Bug in ``SparseDataFrame.loc[]``, ``.iloc[]`` may result in dense ``Series``, rather than ``SparseSeries`` (:issue:`12787`)
- Bug in ``SparseSeries.__repr__`` raises ``TypeError`` when it is longer than ``max_rows`` (:issue:`10560`)
- Bug in ``SparseSeries.shape`` ignores ``fill_value`` (:issue:`10452`)
- Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`)
6 changes: 4 additions & 2 deletions pandas/core/frame.py
@@ -1915,8 +1915,10 @@ def _ixs(self, i, axis=0):
# if we are a copy, mark as such
copy = (isinstance(new_values, np.ndarray) and
new_values.base is None)
-result = Series(new_values, index=self.columns,
-                name=self.index[i], dtype=new_values.dtype)
+result = self._constructor_sliced(new_values,
+                                  index=self.columns,
+                                  name=self.index[i],
+                                  dtype=new_values.dtype)
result._set_is_copy(self, copy=copy)
return result

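The substance of the frame.py change is the switch from a hard-coded ``Series(...)`` to ``self._constructor_sliced(...)``, so a subclass decides which 1-D container a row slice is built with. A toy sketch of that hook pattern (illustrative class names only, not pandas internals):

class Base2D(object):
    """Toy 2-D container illustrating the _constructor_sliced hook."""

    # Type used to wrap 1-D slices; subclasses override it.
    _constructor_sliced = list

    def __init__(self, rows):
        self._rows = rows

    def row(self, i):
        # Build the slice through the hook instead of a hard-coded type.
        return self._constructor_sliced(self._rows[i])


class Special2D(Base2D):
    # Analogous to SparseDataFrame setting _constructor_sliced = SparseSeries.
    _constructor_sliced = tuple


print(type(Base2D([[1, 2]]).row(0)))     # <class 'list'>
print(type(Special2D([[1, 2]]).row(0)))  # <class 'tuple'>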
7 changes: 3 additions & 4 deletions pandas/core/generic.py
@@ -1752,7 +1752,6 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True):
new_index = self.index[loc]

if lib.isscalar(loc):
-from pandas import Series
new_values = self._data.fast_xs(loc)

# may need to box a datelike-scalar
@@ -1763,9 +1762,9 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True):
if not is_list_like(new_values) or self.ndim == 1:
return _maybe_box_datetimelike(new_values)

-result = Series(new_values, index=self.columns,
-                name=self.index[loc], copy=copy,
-                dtype=new_values.dtype)
+result = self._constructor_sliced(new_values, index=self.columns,
+                                  name=self.index[loc], copy=copy,
+                                  dtype=new_values.dtype)

else:
result = self.iloc[loc]
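``xs`` backs scalar row access such as ``sparse.loc[0]``, so routing it through ``self._constructor_sliced`` keeps cross-sections sparse as well. A quick check in the spirit of the new tests (again relying on the deprecated 0.18-era ``to_sparse``):

import numpy as np
import pandas as pd

sdf = pd.DataFrame([[1.0, np.nan], [np.nan, 2.0]], columns=['x', 'y']).to_sparse()

# A row cross-section should now be a SparseSeries, not a dense Series.
print(type(sdf.xs(0)).__name__)   # 'SparseSeries' after this commit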
2 changes: 2 additions & 0 deletions pandas/sparse/frame.py
@@ -136,6 +136,8 @@ def wrapper(data=None, index=None, columns=None,

return wrapper

_constructor_sliced = SparseSeries

def _init_dict(self, data, index, columns, dtype=None):
# pre-filter out columns if we passed it
if columns is not None:
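Registering ``SparseSeries`` as ``_constructor_sliced`` is the one-line hook that the ``frame.py`` and ``generic.py`` changes above pick up; with it in place, column selections keep their sparse container too, roughly as the new tests below exercise:

import numpy as np
import pandas as pd

sdf = pd.DataFrame([[1.0, np.nan], [np.nan, 2.0]], columns=['x', 'y']).to_sparse()

# Column selection through .loc now yields a SparseSeries as well.
print(type(sdf.loc[:, 'y']).__name__)   # 'SparseSeries'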
162 changes: 162 additions & 0 deletions pandas/sparse/tests/test_indexing.py
@@ -82,3 +82,165 @@ def test_iloc_slice(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
tm.assert_sp_series_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse())


class TestSparseDataFrameIndexing(tm.TestCase):

_multiprocess_can_split_ = True

def test_loc(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]],
columns=list('xyz'))
sparse = orig.to_sparse()

self.assertEqual(sparse.loc[0, 'x'], 1)
self.assertTrue(np.isnan(sparse.loc[1, 'z']))
self.assertEqual(sparse.loc[2, 'z'], 4)

tm.assert_sp_series_equal(sparse.loc[0], orig.loc[0].to_sparse())
tm.assert_sp_series_equal(sparse.loc[1], orig.loc[1].to_sparse())
tm.assert_sp_series_equal(sparse.loc[2, :],
orig.loc[2, :].to_sparse())
tm.assert_sp_series_equal(sparse.loc[2, :],
orig.loc[2, :].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:, 'y'],
orig.loc[:, 'y'].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:, 'y'],
orig.loc[:, 'y'].to_sparse())

result = sparse.loc[[1, 2]]
exp = orig.loc[[1, 2]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[[1, 2], :]
exp = orig.loc[[1, 2], :].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[:, ['x', 'z']]
exp = orig.loc[:, ['x', 'z']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[[0, 2], ['x', 'z']]
exp = orig.loc[[0, 2], ['x', 'z']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# includes labels beyond the existing index
result = sparse.loc[[1, 3, 4, 5]]
exp = orig.loc[[1, 3, 4, 5]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# dense array
result = sparse.loc[orig.x % 2 == 1]
exp = orig.loc[orig.x % 2 == 1].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# sparse array (actually it coerces to a normal Series)
result = sparse.loc[sparse.x % 2 == 1]
exp = orig.loc[orig.x % 2 == 1].to_sparse()
tm.assert_sp_frame_equal(result, exp)

def test_loc_index(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]],
index=list('abc'), columns=list('xyz'))
sparse = orig.to_sparse()

self.assertEqual(sparse.loc['a', 'x'], 1)
self.assertTrue(np.isnan(sparse.loc['b', 'z']))
self.assertEqual(sparse.loc['c', 'z'], 4)

tm.assert_sp_series_equal(sparse.loc['a'], orig.loc['a'].to_sparse())
tm.assert_sp_series_equal(sparse.loc['b'], orig.loc['b'].to_sparse())
tm.assert_sp_series_equal(sparse.loc['b', :],
orig.loc['b', :].to_sparse())
tm.assert_sp_series_equal(sparse.loc['b', :],
orig.loc['b', :].to_sparse())

tm.assert_sp_series_equal(sparse.loc[:, 'z'],
orig.loc[:, 'z'].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:, 'z'],
orig.loc[:, 'z'].to_sparse())

result = sparse.loc[['a', 'b']]
exp = orig.loc[['a', 'b']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[['a', 'b'], :]
exp = orig.loc[['a', 'b'], :].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[:, ['x', 'z']]
exp = orig.loc[:, ['x', 'z']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[['c', 'a'], ['x', 'z']]
exp = orig.loc[['c', 'a'], ['x', 'z']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# dense array
result = sparse.loc[orig.x % 2 == 1]
exp = orig.loc[orig.x % 2 == 1].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# sparse array (actually it coerces to a normal Series)
result = sparse.loc[sparse.x % 2 == 1]
exp = orig.loc[orig.x % 2 == 1].to_sparse()
tm.assert_sp_frame_equal(result, exp)

def test_loc_slice(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]],
columns=list('xyz'))
sparse = orig.to_sparse()
tm.assert_sp_frame_equal(sparse.loc[2:], orig.loc[2:].to_sparse())

def test_iloc(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]])
sparse = orig.to_sparse()

self.assertEqual(sparse.iloc[1, 1], 3)
self.assertTrue(np.isnan(sparse.iloc[2, 0]))

tm.assert_sp_series_equal(sparse.iloc[0], orig.loc[0].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[1], orig.loc[1].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[2, :],
orig.iloc[2, :].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[2, :],
orig.iloc[2, :].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[:, 1],
orig.iloc[:, 1].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[:, 1],
orig.iloc[:, 1].to_sparse())

result = sparse.iloc[[1, 2]]
exp = orig.iloc[[1, 2]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.iloc[[1, 2], :]
exp = orig.iloc[[1, 2], :].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.iloc[:, [1, 0]]
exp = orig.iloc[:, [1, 0]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.iloc[[2], [1, 0]]
exp = orig.iloc[[2], [1, 0]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

with tm.assertRaises(IndexError):
sparse.iloc[[1, 3, 5]]

def test_iloc_slice(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]],
columns=list('xyz'))
sparse = orig.to_sparse()
tm.assert_sp_frame_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse())
