From 331db443888e4fa3b2205c82341ad9930fb2ac33 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 26 Apr 2017 11:15:03 -0400 Subject: [PATCH] REGR: Bug in indexing with a CategoricalIndex (#16123) * REGR: Bug in indexing with a CategoricalIndex closes #16115 * some cleaning * BUG: scalar getitem with a CI closes #16131 --- doc/source/whatsnew/v0.20.0.txt | 3 +- pandas/core/indexes/category.py | 21 +++- pandas/tests/indexing/test_categorical.py | 137 ++++++++++++++-------- pandas/tests/reshape/test_reshape.py | 2 +- 4 files changed, 108 insertions(+), 55 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 3a48c94830597a..025ac7673622bf 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1631,7 +1631,8 @@ Indexing - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``pd.concat()`` where the names of ``MultiIndex`` of resulting ``DataFrame`` are not handled correctly when ``None`` is presented in the names of ``MultiIndex`` of input ``DataFrame`` (:issue:`15787`) - Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`, :issue:`16604`) - - Bug in in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`) +- Bug in in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`) +- Bug in indexing with a scalar and a ``CategoricalIndex`` (:issue:`16123`) I/O ^^^ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 5f38b19742f711..760db4ba206757 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -10,7 +10,8 @@ is_list_like, is_interval_dtype, is_scalar) -from pandas.core.common import _asarray_tuplesafe +from pandas.core.common import (_asarray_tuplesafe, + _values_from_object) from pandas.core.dtypes.missing import array_equivalent from pandas.core.algorithms import take_1d @@ -353,6 +354,22 @@ def get_loc(self, key, method=None): raise KeyError(key) return self._engine.get_loc(codes) + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing + """ + try: + k = _values_from_object(key) + k = self._convert_scalar_indexer(k, kind='getitem') + indexer = self.get_loc(k) + return series.iloc[indexer] + except (KeyError, TypeError): + pass + + # we might be a positional inexer + return super(CategoricalIndex, self).get_value(series, key) + def _can_reindex(self, indexer): """ always allow reindexing """ pass @@ -507,7 +524,7 @@ def _convert_list_indexer(self, keyarr, kind=None): indexer = self.categories._convert_list_indexer(keyarr, kind=kind) return Index(self.codes).get_indexer_for(indexer) - indexer = self.categories.get_indexer(keyarr) + indexer = self.categories.get_indexer(np.asarray(keyarr)) if (indexer == -1).any(): raise KeyError( "a list-indexer must only " diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index fd5557dfcb99ce..e0f95a1fd5c0df 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -4,7 +4,8 @@ import pandas as pd import numpy as np -from pandas import Series, DataFrame +from pandas import (Series, DataFrame, Timestamp, + Categorical, CategoricalIndex) from pandas.util.testing import assert_series_equal, assert_frame_equal from pandas.util import testing as tm @@ -66,6 +67,17 @@ def f(): pytest.raises(TypeError, f) + def test_getitem_scalar(self): + + cats = Categorical([Timestamp('12-31-1999'), + Timestamp('12-31-2000')]) + + s = Series([1, 2], index=cats) + + expected = s.iloc[0] + result = s[cats[0]] + assert result == expected + def test_loc_listlike(self): # list of labels @@ -74,7 +86,7 @@ def test_loc_listlike(self): assert_frame_equal(result, expected, check_index_type=True) result = self.df2.loc[['a', 'b', 'e']] - exp_index = pd.CategoricalIndex( + exp_index = CategoricalIndex( list('aaabbe'), categories=list('cabe'), name='B') expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan]}, index=exp_index) assert_frame_equal(result, expected, check_index_type=True) @@ -86,14 +98,14 @@ def test_loc_listlike(self): df = self.df2.copy() df.loc['e'] = 20 result = df.loc[['a', 'b', 'e']] - exp_index = pd.CategoricalIndex( + exp_index = CategoricalIndex( list('aaabbe'), categories=list('cabe'), name='B') expected = DataFrame({'A': [0, 1, 5, 2, 3, 20]}, index=exp_index) assert_frame_equal(result, expected) df = self.df2.copy() result = df.loc[['a', 'b', 'e']] - exp_index = pd.CategoricalIndex( + exp_index = CategoricalIndex( list('aaabbe'), categories=list('cabe'), name='B') expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan]}, index=exp_index) assert_frame_equal(result, expected, check_index_type=True) @@ -105,21 +117,21 @@ def test_loc_listlike_dtypes(self): # GH 11586 # unique categories and codes - index = pd.CategoricalIndex(['a', 'b', 'c']) + index = CategoricalIndex(['a', 'b', 'c']) df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index) # unique slice res = df.loc[['a', 'b']] - exp_index = pd.CategoricalIndex(['a', 'b'], - categories=index.categories) + exp_index = CategoricalIndex(['a', 'b'], + categories=index.categories) exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[['a', 'a', 'b']] - exp_index = pd.CategoricalIndex(['a', 'a', 'b'], - categories=index.categories) + exp_index = CategoricalIndex(['a', 'a', 'b'], + categories=index.categories) exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) @@ -130,14 +142,14 @@ def test_loc_listlike_dtypes(self): df.loc[['a', 'x']] # duplicated categories and codes - index = pd.CategoricalIndex(['a', 'b', 'a']) + index = CategoricalIndex(['a', 'b', 'a']) df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index) # unique slice res = df.loc[['a', 'b']] exp = DataFrame({'A': [1, 3, 2], 'B': [4, 6, 5]}, - index=pd.CategoricalIndex(['a', 'a', 'b'])) + index=CategoricalIndex(['a', 'a', 'b'])) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice @@ -145,7 +157,7 @@ def test_loc_listlike_dtypes(self): exp = DataFrame( {'A': [1, 3, 1, 3, 2], 'B': [4, 6, 4, 6, 5 - ]}, index=pd.CategoricalIndex(['a', 'a', 'a', 'a', 'b'])) + ]}, index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'])) tm.assert_frame_equal(res, exp, check_index_type=True) with tm.assertRaisesRegexp( @@ -155,27 +167,27 @@ def test_loc_listlike_dtypes(self): df.loc[['a', 'x']] # contains unused category - index = pd.CategoricalIndex( + index = CategoricalIndex( ['a', 'b', 'a', 'c'], categories=list('abcde')) df = DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=index) res = df.loc[['a', 'b']] - exp = DataFrame({'A': [1, 3, 2], - 'B': [5, 7, 6]}, index=pd.CategoricalIndex( - ['a', 'a', 'b'], categories=list('abcde'))) + exp = DataFrame({'A': [1, 3, 2], 'B': [5, 7, 6]}, + index=CategoricalIndex(['a', 'a', 'b'], + categories=list('abcde'))) tm.assert_frame_equal(res, exp, check_index_type=True) res = df.loc[['a', 'e']] exp = DataFrame({'A': [1, 3, np.nan], 'B': [5, 7, np.nan]}, - index=pd.CategoricalIndex(['a', 'a', 'e'], - categories=list('abcde'))) + index=CategoricalIndex(['a', 'a', 'e'], + categories=list('abcde'))) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[['a', 'a', 'b']] exp = DataFrame({'A': [1, 3, 1, 3, 2], 'B': [5, 7, 5, 7, 6]}, - index=pd.CategoricalIndex(['a', 'a', 'a', 'a', 'b'], - categories=list('abcde'))) + index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'], + categories=list('abcde'))) tm.assert_frame_equal(res, exp, check_index_type=True) with tm.assertRaisesRegexp( @@ -184,54 +196,77 @@ def test_loc_listlike_dtypes(self): 'that are in the categories'): df.loc[['a', 'x']] + def test_get_indexer_array(self): + arr = np.array([Timestamp('1999-12-31 00:00:00'), + Timestamp('2000-12-31 00:00:00')], dtype=object) + cats = [Timestamp('1999-12-31 00:00:00'), + Timestamp('2000-12-31 00:00:00')] + ci = CategoricalIndex(cats, + categories=cats, + ordered=False, dtype='category') + result = ci.get_indexer(arr) + expected = np.array([0, 1], dtype='intp') + tm.assert_numpy_array_equal(result, expected) + + def test_getitem_with_listlike(self): + # GH 16115 + cats = Categorical([Timestamp('12-31-1999'), + Timestamp('12-31-2000')]) + + expected = DataFrame([[1, 0], [0, 1]], dtype='uint8', + index=[0, 1], columns=cats) + dummies = pd.get_dummies(cats) + result = dummies[[c for c in dummies.columns]] + assert_frame_equal(result, expected) + def test_ix_categorical_index(self): # GH 12531 - df = pd.DataFrame(np.random.randn(3, 3), - index=list('ABC'), columns=list('XYZ')) + df = DataFrame(np.random.randn(3, 3), + index=list('ABC'), columns=list('XYZ')) cdf = df.copy() - cdf.index = pd.CategoricalIndex(df.index) - cdf.columns = pd.CategoricalIndex(df.columns) + cdf.index = CategoricalIndex(df.index) + cdf.columns = CategoricalIndex(df.columns) - expect = pd.Series(df.loc['A', :], index=cdf.columns, name='A') + expect = Series(df.loc['A', :], index=cdf.columns, name='A') assert_series_equal(cdf.loc['A', :], expect) - expect = pd.Series(df.loc[:, 'X'], index=cdf.index, name='X') + expect = Series(df.loc[:, 'X'], index=cdf.index, name='X') assert_series_equal(cdf.loc[:, 'X'], expect) - exp_index = pd.CategoricalIndex(list('AB'), categories=['A', 'B', 'C']) - expect = pd.DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns, - index=exp_index) + exp_index = CategoricalIndex(list('AB'), categories=['A', 'B', 'C']) + expect = DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns, + index=exp_index) assert_frame_equal(cdf.loc[['A', 'B'], :], expect) - exp_columns = pd.CategoricalIndex(list('XY'), - categories=['X', 'Y', 'Z']) - expect = pd.DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index, - columns=exp_columns) + exp_columns = CategoricalIndex(list('XY'), + categories=['X', 'Y', 'Z']) + expect = DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index, + columns=exp_columns) assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect) # non-unique - df = pd.DataFrame(np.random.randn(3, 3), - index=list('ABA'), columns=list('XYX')) + df = DataFrame(np.random.randn(3, 3), + index=list('ABA'), columns=list('XYX')) cdf = df.copy() - cdf.index = pd.CategoricalIndex(df.index) - cdf.columns = pd.CategoricalIndex(df.columns) + cdf.index = CategoricalIndex(df.index) + cdf.columns = CategoricalIndex(df.columns) - exp_index = pd.CategoricalIndex(list('AA'), categories=['A', 'B']) - expect = pd.DataFrame(df.loc['A', :], columns=cdf.columns, - index=exp_index) + exp_index = CategoricalIndex(list('AA'), categories=['A', 'B']) + expect = DataFrame(df.loc['A', :], columns=cdf.columns, + index=exp_index) assert_frame_equal(cdf.loc['A', :], expect) - exp_columns = pd.CategoricalIndex(list('XX'), categories=['X', 'Y']) - expect = pd.DataFrame(df.loc[:, 'X'], index=cdf.index, - columns=exp_columns) + exp_columns = CategoricalIndex(list('XX'), categories=['X', 'Y']) + expect = DataFrame(df.loc[:, 'X'], index=cdf.index, + columns=exp_columns) assert_frame_equal(cdf.loc[:, 'X'], expect) - expect = pd.DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns, - index=pd.CategoricalIndex(list('AAB'))) + expect = DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns, + index=CategoricalIndex(list('AAB'))) assert_frame_equal(cdf.loc[['A', 'B'], :], expect) - expect = pd.DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index, - columns=pd.CategoricalIndex(list('XXY'))) + expect = DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index, + columns=CategoricalIndex(list('XXY'))) assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect) def test_read_only_source(self): @@ -281,13 +316,13 @@ def test_reindexing(self): # then return a Categorical cats = list('cabe') - result = self.df2.reindex(pd.Categorical(['a', 'd'], categories=cats)) + result = self.df2.reindex(Categorical(['a', 'd'], categories=cats)) expected = DataFrame({'A': [0, 1, 5, np.nan], 'B': Series(list('aaad')).astype( 'category', categories=cats)}).set_index('B') assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(pd.Categorical(['a'], categories=cats)) + result = self.df2.reindex(Categorical(['a'], categories=cats)) expected = DataFrame({'A': [0, 1, 5], 'B': Series(list('aaa')).astype( 'category', categories=cats)}).set_index('B') @@ -309,7 +344,7 @@ def test_reindexing(self): assert_frame_equal(result, expected, check_index_type=True) # give back the type of categorical that we received - result = self.df2.reindex(pd.Categorical( + result = self.df2.reindex(Categorical( ['a', 'd'], categories=cats, ordered=True)) expected = DataFrame( {'A': [0, 1, 5, np.nan], @@ -317,7 +352,7 @@ def test_reindexing(self): ordered=True)}).set_index('B') assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(pd.Categorical( + result = self.df2.reindex(Categorical( ['a', 'd'], categories=['a', 'd'])) expected = DataFrame({'A': [0, 1, 5, np.nan], 'B': Series(list('aaad')).astype( diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 8960ae610f8d78..f41c977cc03e10 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -490,8 +490,8 @@ def test_dataframe_dummies_with_categorical(self): 'cat_x', 'cat_y']] assert_frame_equal(result, expected) - # GH12402 Add a new parameter `drop_first` to avoid collinearity def test_basic_drop_first(self): + # GH12402 Add a new parameter `drop_first` to avoid collinearity # Basic case s_list = list('abc') s_series = Series(s_list)