REGR: Bug in indexing with a CategoricalIndex (pandas-dev#16123)

* REGR: Bug in indexing with a CategoricalIndex; closes pandas-dev#16115
* some cleaning
* BUG: scalar getitem with a CI; closes pandas-dev#16131
pcluo · May 22, 2017 · 331db44 · 331db44
1 parent 3234601
commit 331db44
Show file tree

Hide file tree

Showing 4 changed files with 108 additions and 55 deletions.
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -1631,7 +1631,8 @@ Indexing
 - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
 - Bug in ``pd.concat()`` where the names of ``MultiIndex`` of resulting ``DataFrame`` are not handled correctly when ``None`` is presented in the names of ``MultiIndex`` of input ``DataFrame`` (:issue:`15787`)
 - Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`, :issue:`16604`)
- - Bug in in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`)
+- Bug in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`)
+- Bug in indexing with a scalar and a ``CategoricalIndex`` (:issue:`16123`)
 
 I/O
 ^^^

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -10,7 +10,8 @@
     is_list_like,
     is_interval_dtype,
     is_scalar)
-from pandas.core.common import _asarray_tuplesafe
+from pandas.core.common import (_asarray_tuplesafe,
+                                _values_from_object)
 from pandas.core.dtypes.missing import array_equivalent
 from pandas.core.algorithms import take_1d
 
@@ -353,6 +354,22 @@ def get_loc(self, key, method=None):
             raise KeyError(key)
         return self._engine.get_loc(codes)
 
+    def get_value(self, series, key):
+        """
+        Fast lookup of value from 1-dimensional ndarray. Only use this if you
+        know what you're doing
+        """
+        try:
+            k = _values_from_object(key)
+            k = self._convert_scalar_indexer(k, kind='getitem')
+            indexer = self.get_loc(k)
+            return series.iloc[indexer]
+        except (KeyError, TypeError):
+            pass
+
+        # we might be a positional indexer
+        return super(CategoricalIndex, self).get_value(series, key)
+
     def _can_reindex(self, indexer):
         """ always allow reindexing """
         pass
@@ -507,7 +524,7 @@ def _convert_list_indexer(self, keyarr, kind=None):
             indexer = self.categories._convert_list_indexer(keyarr, kind=kind)
             return Index(self.codes).get_indexer_for(indexer)
 
-        indexer = self.categories.get_indexer(keyarr)
+        indexer = self.categories.get_indexer(np.asarray(keyarr))
         if (indexer == -1).any():
             raise KeyError(
                 "a list-indexer must only "

diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
@@ -4,7 +4,8 @@
 
 import pandas as pd
 import numpy as np
-from pandas import Series, DataFrame
+from pandas import (Series, DataFrame, Timestamp,
+                    Categorical, CategoricalIndex)
 from pandas.util.testing import assert_series_equal, assert_frame_equal
 from pandas.util import testing as tm
 
@@ -66,6 +67,17 @@ def f():
 
         pytest.raises(TypeError, f)
 
+    def test_getitem_scalar(self):
+
+        cats = Categorical([Timestamp('12-31-1999'),
+                            Timestamp('12-31-2000')])
+
+        s = Series([1, 2], index=cats)
+
+        expected = s.iloc[0]
+        result = s[cats[0]]
+        assert result == expected
+
     def test_loc_listlike(self):
 
         # list of labels
@@ -74,7 +86,7 @@ def test_loc_listlike(self):
         assert_frame_equal(result, expected, check_index_type=True)
 
         result = self.df2.loc[['a', 'b', 'e']]
-        exp_index = pd.CategoricalIndex(
+        exp_index = CategoricalIndex(
             list('aaabbe'), categories=list('cabe'), name='B')
         expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
         assert_frame_equal(result, expected, check_index_type=True)
@@ -86,14 +98,14 @@ def test_loc_listlike(self):
         df = self.df2.copy()
         df.loc['e'] = 20
         result = df.loc[['a', 'b', 'e']]
-        exp_index = pd.CategoricalIndex(
+        exp_index = CategoricalIndex(
             list('aaabbe'), categories=list('cabe'), name='B')
         expected = DataFrame({'A': [0, 1, 5, 2, 3, 20]}, index=exp_index)
         assert_frame_equal(result, expected)
 
         df = self.df2.copy()
         result = df.loc[['a', 'b', 'e']]
-        exp_index = pd.CategoricalIndex(
+        exp_index = CategoricalIndex(
             list('aaabbe'), categories=list('cabe'), name='B')
         expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
         assert_frame_equal(result, expected, check_index_type=True)
@@ -105,21 +117,21 @@ def test_loc_listlike_dtypes(self):
         # GH 11586
 
         # unique categories and codes
-        index = pd.CategoricalIndex(['a', 'b', 'c'])
+        index = CategoricalIndex(['a', 'b', 'c'])
         df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index)
 
         # unique slice
         res = df.loc[['a', 'b']]
-        exp_index = pd.CategoricalIndex(['a', 'b'],
-                                        categories=index.categories)
+        exp_index = CategoricalIndex(['a', 'b'],
+                                     categories=index.categories)
         exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=exp_index)
         tm.assert_frame_equal(res, exp, check_index_type=True)
 
         # duplicated slice
         res = df.loc[['a', 'a', 'b']]
 
-        exp_index = pd.CategoricalIndex(['a', 'a', 'b'],
-                                        categories=index.categories)
+        exp_index = CategoricalIndex(['a', 'a', 'b'],
+                                     categories=index.categories)
         exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=exp_index)
         tm.assert_frame_equal(res, exp, check_index_type=True)
 
@@ -130,22 +142,22 @@ def test_loc_listlike_dtypes(self):
             df.loc[['a', 'x']]
 
         # duplicated categories and codes
-        index = pd.CategoricalIndex(['a', 'b', 'a'])
+        index = CategoricalIndex(['a', 'b', 'a'])
         df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index)
 
         # unique slice
         res = df.loc[['a', 'b']]
         exp = DataFrame({'A': [1, 3, 2],
                          'B': [4, 6, 5]},
-                        index=pd.CategoricalIndex(['a', 'a', 'b']))
+                        index=CategoricalIndex(['a', 'a', 'b']))
         tm.assert_frame_equal(res, exp, check_index_type=True)
 
         # duplicated slice
         res = df.loc[['a', 'a', 'b']]
         exp = DataFrame(
             {'A': [1, 3, 1, 3, 2],
              'B': [4, 6, 4, 6, 5
-                   ]}, index=pd.CategoricalIndex(['a', 'a', 'a', 'a', 'b']))
+                   ]}, index=CategoricalIndex(['a', 'a', 'a', 'a', 'b']))
         tm.assert_frame_equal(res, exp, check_index_type=True)
 
         with tm.assertRaisesRegexp(
@@ -155,27 +167,27 @@ def test_loc_listlike_dtypes(self):
             df.loc[['a', 'x']]
 
         # contains unused category
-        index = pd.CategoricalIndex(
+        index = CategoricalIndex(
             ['a', 'b', 'a', 'c'], categories=list('abcde'))
         df = DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=index)
 
         res = df.loc[['a', 'b']]
-        exp = DataFrame({'A': [1, 3, 2],
-                         'B': [5, 7, 6]}, index=pd.CategoricalIndex(
-                             ['a', 'a', 'b'], categories=list('abcde')))
+        exp = DataFrame({'A': [1, 3, 2], 'B': [5, 7, 6]},
+                        index=CategoricalIndex(['a', 'a', 'b'],
+                                               categories=list('abcde')))
         tm.assert_frame_equal(res, exp, check_index_type=True)
 
         res = df.loc[['a', 'e']]
         exp = DataFrame({'A': [1, 3, np.nan], 'B': [5, 7, np.nan]},
-                        index=pd.CategoricalIndex(['a', 'a', 'e'],
-                                                  categories=list('abcde')))
+                        index=CategoricalIndex(['a', 'a', 'e'],
+                                               categories=list('abcde')))
         tm.assert_frame_equal(res, exp, check_index_type=True)
 
         # duplicated slice
         res = df.loc[['a', 'a', 'b']]
         exp = DataFrame({'A': [1, 3, 1, 3, 2], 'B': [5, 7, 5, 7, 6]},
-                        index=pd.CategoricalIndex(['a', 'a', 'a', 'a', 'b'],
-                                                  categories=list('abcde')))
+                        index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'],
+                                               categories=list('abcde')))
         tm.assert_frame_equal(res, exp, check_index_type=True)
 
         with tm.assertRaisesRegexp(
@@ -184,54 +196,77 @@ def test_loc_listlike_dtypes(self):
                 'that are in the categories'):
             df.loc[['a', 'x']]
 
+    def test_get_indexer_array(self):
+        arr = np.array([Timestamp('1999-12-31 00:00:00'),
+                        Timestamp('2000-12-31 00:00:00')], dtype=object)
+        cats = [Timestamp('1999-12-31 00:00:00'),
+                Timestamp('2000-12-31 00:00:00')]
+        ci = CategoricalIndex(cats,
+                              categories=cats,
+                              ordered=False, dtype='category')
+        result = ci.get_indexer(arr)
+        expected = np.array([0, 1], dtype='intp')
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_getitem_with_listlike(self):
+        # GH 16115
+        cats = Categorical([Timestamp('12-31-1999'),
+                            Timestamp('12-31-2000')])
+
+        expected = DataFrame([[1, 0], [0, 1]], dtype='uint8',
+                             index=[0, 1], columns=cats)
+        dummies = pd.get_dummies(cats)
+        result = dummies[[c for c in dummies.columns]]
+        assert_frame_equal(result, expected)
+
     def test_ix_categorical_index(self):
         # GH 12531
-        df = pd.DataFrame(np.random.randn(3, 3),
-                          index=list('ABC'), columns=list('XYZ'))
+        df = DataFrame(np.random.randn(3, 3),
+                       index=list('ABC'), columns=list('XYZ'))
         cdf = df.copy()
-        cdf.index = pd.CategoricalIndex(df.index)
-        cdf.columns = pd.CategoricalIndex(df.columns)
+        cdf.index = CategoricalIndex(df.index)
+        cdf.columns = CategoricalIndex(df.columns)
 
-        expect = pd.Series(df.loc['A', :], index=cdf.columns, name='A')
+        expect = Series(df.loc['A', :], index=cdf.columns, name='A')
         assert_series_equal(cdf.loc['A', :], expect)
 
-        expect = pd.Series(df.loc[:, 'X'], index=cdf.index, name='X')
+        expect = Series(df.loc[:, 'X'], index=cdf.index, name='X')
         assert_series_equal(cdf.loc[:, 'X'], expect)
 
-        exp_index = pd.CategoricalIndex(list('AB'), categories=['A', 'B', 'C'])
-        expect = pd.DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns,
-                              index=exp_index)
+        exp_index = CategoricalIndex(list('AB'), categories=['A', 'B', 'C'])
+        expect = DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns,
+                           index=exp_index)
         assert_frame_equal(cdf.loc[['A', 'B'], :], expect)
 
-        exp_columns = pd.CategoricalIndex(list('XY'),
-                                          categories=['X', 'Y', 'Z'])
-        expect = pd.DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index,
-                              columns=exp_columns)
+        exp_columns = CategoricalIndex(list('XY'),
+                                       categories=['X', 'Y', 'Z'])
+        expect = DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index,
+                           columns=exp_columns)
         assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect)
 
         # non-unique
-        df = pd.DataFrame(np.random.randn(3, 3),
-                          index=list('ABA'), columns=list('XYX'))
+        df = DataFrame(np.random.randn(3, 3),
+                       index=list('ABA'), columns=list('XYX'))
         cdf = df.copy()
-        cdf.index = pd.CategoricalIndex(df.index)
-        cdf.columns = pd.CategoricalIndex(df.columns)
+        cdf.index = CategoricalIndex(df.index)
+        cdf.columns = CategoricalIndex(df.columns)
 
-        exp_index = pd.CategoricalIndex(list('AA'), categories=['A', 'B'])
-        expect = pd.DataFrame(df.loc['A', :], columns=cdf.columns,
-                              index=exp_index)
+        exp_index = CategoricalIndex(list('AA'), categories=['A', 'B'])
+        expect = DataFrame(df.loc['A', :], columns=cdf.columns,
+                           index=exp_index)
         assert_frame_equal(cdf.loc['A', :], expect)
 
-        exp_columns = pd.CategoricalIndex(list('XX'), categories=['X', 'Y'])
-        expect = pd.DataFrame(df.loc[:, 'X'], index=cdf.index,
-                              columns=exp_columns)
+        exp_columns = CategoricalIndex(list('XX'), categories=['X', 'Y'])
+        expect = DataFrame(df.loc[:, 'X'], index=cdf.index,
+                           columns=exp_columns)
         assert_frame_equal(cdf.loc[:, 'X'], expect)
 
-        expect = pd.DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns,
-                              index=pd.CategoricalIndex(list('AAB')))
+        expect = DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns,
+                           index=CategoricalIndex(list('AAB')))
         assert_frame_equal(cdf.loc[['A', 'B'], :], expect)
 
-        expect = pd.DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index,
-                              columns=pd.CategoricalIndex(list('XXY')))
+        expect = DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index,
+                           columns=CategoricalIndex(list('XXY')))
         assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect)
 
     def test_read_only_source(self):
@@ -281,13 +316,13 @@ def test_reindexing(self):
         # then return a Categorical
         cats = list('cabe')
 
-        result = self.df2.reindex(pd.Categorical(['a', 'd'], categories=cats))
+        result = self.df2.reindex(Categorical(['a', 'd'], categories=cats))
         expected = DataFrame({'A': [0, 1, 5, np.nan],
                               'B': Series(list('aaad')).astype(
                                   'category', categories=cats)}).set_index('B')
         assert_frame_equal(result, expected, check_index_type=True)
 
-        result = self.df2.reindex(pd.Categorical(['a'], categories=cats))
+        result = self.df2.reindex(Categorical(['a'], categories=cats))
         expected = DataFrame({'A': [0, 1, 5],
                               'B': Series(list('aaa')).astype(
                                   'category', categories=cats)}).set_index('B')
@@ -309,15 +344,15 @@ def test_reindexing(self):
         assert_frame_equal(result, expected, check_index_type=True)
 
         # give back the type of categorical that we received
-        result = self.df2.reindex(pd.Categorical(
+        result = self.df2.reindex(Categorical(
             ['a', 'd'], categories=cats, ordered=True))
         expected = DataFrame(
             {'A': [0, 1, 5, np.nan],
              'B': Series(list('aaad')).astype('category', categories=cats,
                                               ordered=True)}).set_index('B')
         assert_frame_equal(result, expected, check_index_type=True)
 
-        result = self.df2.reindex(pd.Categorical(
+        result = self.df2.reindex(Categorical(
             ['a', 'd'], categories=['a', 'd']))
         expected = DataFrame({'A': [0, 1, 5, np.nan],
                               'B': Series(list('aaad')).astype(

diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py
@@ -490,8 +490,8 @@ def test_dataframe_dummies_with_categorical(self):
                              'cat_x', 'cat_y']]
         assert_frame_equal(result, expected)
 
-    # GH12402 Add a new parameter `drop_first` to avoid collinearity
     def test_basic_drop_first(self):
+        # GH12402 Add a new parameter `drop_first` to avoid collinearity
         # Basic case
         s_list = list('abc')
         s_series = Series(s_list)