BUG: SparseDataFrame indexing may return normal Series
closes #12787
sinhrks authored and jreback committed Apr 4, 2016
1 parent 610d3d5 commit a3a0942
Showing 5 changed files with 172 additions and 6 deletions.
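For context, a minimal reproduction sketch of the behavior this commit addresses, built on the same frame the new tests use (the pre-fix return type is inferred from the bug report; `to_sparse` is the 0.18-era API and has since been removed from pandas):

import numpy as np
import pandas as pd

orig = pd.DataFrame([[1, np.nan, np.nan],
                     [2, 3, np.nan],
                     [np.nan, np.nan, 4]],
                    columns=list('xyz'))
sparse = orig.to_sparse()

# Selecting a single row (or column) from a SparseDataFrame could come back
# as a dense Series before this fix; afterwards it stays sparse.
row = sparse.loc[0]
print(type(row).__name__)   # 'SparseSeries' after this commit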
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.1.txt
@@ -77,6 +77,7 @@ These changes conform sparse handling to return the correct types and work to ma
- Bug in ``SparseSeries.loc[]`` with list-like input raises ``TypeError`` (:issue:`10560`)
- Bug in ``SparseSeries.iloc[]`` with scalar input may raise ``IndexError`` (:issue:`10560`)
- Bug in ``SparseSeries.loc[]``, ``.iloc[]`` with ``slice`` returns ``SparseArray``, rather than ``SparseSeries`` (:issue:`10560`)
- Bug in ``SparseDataFrame.loc[]``, ``.iloc[]`` may result in dense ``Series``, rather than ``SparseSeries`` (:issue:`12787`)
- Bug in ``SparseSeries.__repr__`` raises ``TypeError`` when it is longer than ``max_rows`` (:issue:`10560`)
- Bug in ``SparseSeries.shape`` ignores ``fill_value`` (:issue:`10452`)
- Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`)
6 changes: 4 additions & 2 deletions pandas/core/frame.py
@@ -1915,8 +1915,10 @@ def _ixs(self, i, axis=0):
# if we are a copy, mark as such
copy = (isinstance(new_values, np.ndarray) and
new_values.base is None)
-result = Series(new_values, index=self.columns,
-                name=self.index[i], dtype=new_values.dtype)
+result = self._constructor_sliced(new_values,
+                                  index=self.columns,
+                                  name=self.index[i],
+                                  dtype=new_values.dtype)
result._set_is_copy(self, copy=copy)
return result

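The substance of the frame.py change is the switch from a hard-coded ``Series(...)`` to ``self._constructor_sliced(...)``, so a subclass decides which 1-D container a row slice is built with. A toy sketch of that hook pattern (illustrative class names only, not pandas internals):

class Base2D(object):
    """Toy 2-D container illustrating the _constructor_sliced hook."""

    # Type used to wrap 1-D slices; subclasses override it.
    _constructor_sliced = list

    def __init__(self, rows):
        self._rows = rows

    def row(self, i):
        # Build the slice through the hook instead of a hard-coded type.
        return self._constructor_sliced(self._rows[i])


class Special2D(Base2D):
    # Analogous to SparseDataFrame setting _constructor_sliced = SparseSeries.
    _constructor_sliced = tuple


print(type(Base2D([[1, 2]]).row(0)))     # <class 'list'>
print(type(Special2D([[1, 2]]).row(0)))  # <class 'tuple'>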
7 changes: 3 additions & 4 deletions pandas/core/generic.py
@@ -1752,7 +1752,6 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True):
new_index = self.index[loc]

if lib.isscalar(loc):
-from pandas import Series
new_values = self._data.fast_xs(loc)

# may need to box a datelike-scalar
@@ -1763,9 +1762,9 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True):
if not is_list_like(new_values) or self.ndim == 1:
return _maybe_box_datetimelike(new_values)

-result = Series(new_values, index=self.columns,
-                name=self.index[loc], copy=copy,
-                dtype=new_values.dtype)
+result = self._constructor_sliced(new_values, index=self.columns,
+                                  name=self.index[loc], copy=copy,
+                                  dtype=new_values.dtype)

else:
result = self.iloc[loc]
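``xs`` backs scalar row access such as ``sparse.loc[0]``, so routing it through ``self._constructor_sliced`` keeps cross-sections sparse as well. A quick check in the spirit of the new tests (again relying on the deprecated 0.18-era ``to_sparse``):

import numpy as np
import pandas as pd

sdf = pd.DataFrame([[1.0, np.nan], [np.nan, 2.0]], columns=['x', 'y']).to_sparse()

# A row cross-section should now be a SparseSeries, not a dense Series.
print(type(sdf.xs(0)).__name__)   # 'SparseSeries' after this commit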
2 changes: 2 additions & 0 deletions pandas/sparse/frame.py
@@ -136,6 +136,8 @@ def wrapper(data=None, index=None, columns=None,

return wrapper

_constructor_sliced = SparseSeries

def _init_dict(self, data, index, columns, dtype=None):
# pre-filter out columns if we passed it
if columns is not None:
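Registering ``SparseSeries`` as ``_constructor_sliced`` is the one-line hook that the ``frame.py`` and ``generic.py`` changes above pick up; with it in place, column selections keep their sparse container too, roughly as the new tests below exercise:

import numpy as np
import pandas as pd

sdf = pd.DataFrame([[1.0, np.nan], [np.nan, 2.0]], columns=['x', 'y']).to_sparse()

# Column selection through .loc now yields a SparseSeries as well.
print(type(sdf.loc[:, 'y']).__name__)   # 'SparseSeries'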
162 changes: 162 additions & 0 deletions pandas/sparse/tests/test_indexing.py
@@ -82,3 +82,165 @@ def test_iloc_slice(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
tm.assert_sp_series_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse())


class TestSparseDataFrameIndexing(tm.TestCase):

_multiprocess_can_split_ = True

def test_loc(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]],
columns=list('xyz'))
sparse = orig.to_sparse()

self.assertEqual(sparse.loc[0, 'x'], 1)
self.assertTrue(np.isnan(sparse.loc[1, 'z']))
self.assertEqual(sparse.loc[2, 'z'], 4)

tm.assert_sp_series_equal(sparse.loc[0], orig.loc[0].to_sparse())
tm.assert_sp_series_equal(sparse.loc[1], orig.loc[1].to_sparse())
tm.assert_sp_series_equal(sparse.loc[2, :],
orig.loc[2, :].to_sparse())
tm.assert_sp_series_equal(sparse.loc[2, :],
orig.loc[2, :].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:, 'y'],
orig.loc[:, 'y'].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:, 'y'],
orig.loc[:, 'y'].to_sparse())

result = sparse.loc[[1, 2]]
exp = orig.loc[[1, 2]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[[1, 2], :]
exp = orig.loc[[1, 2], :].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[:, ['x', 'z']]
exp = orig.loc[:, ['x', 'z']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[[0, 2], ['x', 'z']]
exp = orig.loc[[0, 2], ['x', 'z']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# includes labels beyond the existing index
result = sparse.loc[[1, 3, 4, 5]]
exp = orig.loc[[1, 3, 4, 5]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# dense array
result = sparse.loc[orig.x % 2 == 1]
exp = orig.loc[orig.x % 2 == 1].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# sparse array (actually it coerces to a normal Series)
result = sparse.loc[sparse.x % 2 == 1]
exp = orig.loc[orig.x % 2 == 1].to_sparse()
tm.assert_sp_frame_equal(result, exp)

def test_loc_index(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]],
index=list('abc'), columns=list('xyz'))
sparse = orig.to_sparse()

self.assertEqual(sparse.loc['a', 'x'], 1)
self.assertTrue(np.isnan(sparse.loc['b', 'z']))
self.assertEqual(sparse.loc['c', 'z'], 4)

tm.assert_sp_series_equal(sparse.loc['a'], orig.loc['a'].to_sparse())
tm.assert_sp_series_equal(sparse.loc['b'], orig.loc['b'].to_sparse())
tm.assert_sp_series_equal(sparse.loc['b', :],
orig.loc['b', :].to_sparse())
tm.assert_sp_series_equal(sparse.loc['b', :],
orig.loc['b', :].to_sparse())

tm.assert_sp_series_equal(sparse.loc[:, 'z'],
orig.loc[:, 'z'].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:, 'z'],
orig.loc[:, 'z'].to_sparse())

result = sparse.loc[['a', 'b']]
exp = orig.loc[['a', 'b']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[['a', 'b'], :]
exp = orig.loc[['a', 'b'], :].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[:, ['x', 'z']]
exp = orig.loc[:, ['x', 'z']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[['c', 'a'], ['x', 'z']]
exp = orig.loc[['c', 'a'], ['x', 'z']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# dense array
result = sparse.loc[orig.x % 2 == 1]
exp = orig.loc[orig.x % 2 == 1].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# sparse array (actually it coerces to a normal Series)
result = sparse.loc[sparse.x % 2 == 1]
exp = orig.loc[orig.x % 2 == 1].to_sparse()
tm.assert_sp_frame_equal(result, exp)

def test_loc_slice(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]],
columns=list('xyz'))
sparse = orig.to_sparse()
tm.assert_sp_frame_equal(sparse.loc[2:], orig.loc[2:].to_sparse())

def test_iloc(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]])
sparse = orig.to_sparse()

self.assertEqual(sparse.iloc[1, 1], 3)
self.assertTrue(np.isnan(sparse.iloc[2, 0]))

tm.assert_sp_series_equal(sparse.iloc[0], orig.loc[0].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[1], orig.loc[1].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[2, :],
orig.iloc[2, :].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[2, :],
orig.iloc[2, :].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[:, 1],
orig.iloc[:, 1].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[:, 1],
orig.iloc[:, 1].to_sparse())

result = sparse.iloc[[1, 2]]
exp = orig.iloc[[1, 2]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.iloc[[1, 2], :]
exp = orig.iloc[[1, 2], :].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.iloc[:, [1, 0]]
exp = orig.iloc[:, [1, 0]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.iloc[[2], [1, 0]]
exp = orig.iloc[[2], [1, 0]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

with tm.assertRaises(IndexError):
sparse.iloc[[1, 3, 5]]

def test_iloc_slice(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]],
columns=list('xyz'))
sparse = orig.to_sparse()
tm.assert_sp_frame_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse())
