Skip to content

Commit

Permalink
REGR: Bug in indexing with a CategoricalIndex (pandas-dev#16123)
Browse files Browse the repository at this point in the history
* REGR: Bug in indexing with a CategoricalIndex

closes pandas-dev#16115

* some cleaning

* BUG: scalar getitem with a CI

closes pandas-dev#16131
  • Loading branch information
jreback authored and pcluo committed May 22, 2017
1 parent 3234601 commit 331db44
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 55 deletions.
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1631,7 +1631,8 @@ Indexing
- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
- Bug in ``pd.concat()`` where the names of ``MultiIndex`` of resulting ``DataFrame`` are not handled correctly when ``None`` is presented in the names of ``MultiIndex`` of input ``DataFrame`` (:issue:`15787`)
- Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`, :issue:`16604`)
- Bug in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`)
- Bug in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`)
- Bug in indexing with a scalar and a ``CategoricalIndex`` (:issue:`16123`)

I/O
^^^
Expand Down
21 changes: 19 additions & 2 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
is_list_like,
is_interval_dtype,
is_scalar)
from pandas.core.common import _asarray_tuplesafe
from pandas.core.common import (_asarray_tuplesafe,
_values_from_object)
from pandas.core.dtypes.missing import array_equivalent
from pandas.core.algorithms import take_1d

Expand Down Expand Up @@ -353,6 +354,22 @@ def get_loc(self, key, method=None):
raise KeyError(key)
return self._engine.get_loc(codes)

def get_value(self, series, key):
    """
    Look up the value for ``key`` in ``series``.

    The key is first unwrapped with ``_values_from_object`` and passed
    through ``_convert_scalar_indexer`` (``kind='getitem'``); the result
    is resolved with ``get_loc`` and used to index ``series.iloc``.

    If that label-based attempt raises ``KeyError`` or ``TypeError``, the
    lookup is delegated to the parent class' ``get_value``.
    """
    try:
        label = self._convert_scalar_indexer(_values_from_object(key),
                                             kind='getitem')
        return series.iloc[self.get_loc(label)]
    except (KeyError, TypeError):
        # label-based lookup failed; the key might be a positional
        # indexer, so fall through to the parent implementation below
        pass

    return super(CategoricalIndex, self).get_value(series, key)

def _can_reindex(self, indexer):
""" always allow reindexing """
pass
Expand Down Expand Up @@ -507,7 +524,7 @@ def _convert_list_indexer(self, keyarr, kind=None):
indexer = self.categories._convert_list_indexer(keyarr, kind=kind)
return Index(self.codes).get_indexer_for(indexer)

indexer = self.categories.get_indexer(keyarr)
indexer = self.categories.get_indexer(np.asarray(keyarr))
if (indexer == -1).any():
raise KeyError(
"a list-indexer must only "
Expand Down
137 changes: 86 additions & 51 deletions pandas/tests/indexing/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from pandas import (Series, DataFrame, Timestamp,
Categorical, CategoricalIndex)
from pandas.util.testing import assert_series_equal, assert_frame_equal
from pandas.util import testing as tm

Expand Down Expand Up @@ -66,6 +67,17 @@ def f():

pytest.raises(TypeError, f)

def test_getitem_scalar(self):

cats = Categorical([Timestamp('12-31-1999'),
Timestamp('12-31-2000')])

s = Series([1, 2], index=cats)

expected = s.iloc[0]
result = s[cats[0]]
assert result == expected

def test_loc_listlike(self):

# list of labels
Expand All @@ -74,7 +86,7 @@ def test_loc_listlike(self):
assert_frame_equal(result, expected, check_index_type=True)

result = self.df2.loc[['a', 'b', 'e']]
exp_index = pd.CategoricalIndex(
exp_index = CategoricalIndex(
list('aaabbe'), categories=list('cabe'), name='B')
expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
assert_frame_equal(result, expected, check_index_type=True)
Expand All @@ -86,14 +98,14 @@ def test_loc_listlike(self):
df = self.df2.copy()
df.loc['e'] = 20
result = df.loc[['a', 'b', 'e']]
exp_index = pd.CategoricalIndex(
exp_index = CategoricalIndex(
list('aaabbe'), categories=list('cabe'), name='B')
expected = DataFrame({'A': [0, 1, 5, 2, 3, 20]}, index=exp_index)
assert_frame_equal(result, expected)

df = self.df2.copy()
result = df.loc[['a', 'b', 'e']]
exp_index = pd.CategoricalIndex(
exp_index = CategoricalIndex(
list('aaabbe'), categories=list('cabe'), name='B')
expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
assert_frame_equal(result, expected, check_index_type=True)
Expand All @@ -105,21 +117,21 @@ def test_loc_listlike_dtypes(self):
# GH 11586

# unique categories and codes
index = pd.CategoricalIndex(['a', 'b', 'c'])
index = CategoricalIndex(['a', 'b', 'c'])
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index)

# unique slice
res = df.loc[['a', 'b']]
exp_index = pd.CategoricalIndex(['a', 'b'],
categories=index.categories)
exp_index = CategoricalIndex(['a', 'b'],
categories=index.categories)
exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=exp_index)
tm.assert_frame_equal(res, exp, check_index_type=True)

# duplicated slice
res = df.loc[['a', 'a', 'b']]

exp_index = pd.CategoricalIndex(['a', 'a', 'b'],
categories=index.categories)
exp_index = CategoricalIndex(['a', 'a', 'b'],
categories=index.categories)
exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=exp_index)
tm.assert_frame_equal(res, exp, check_index_type=True)

Expand All @@ -130,22 +142,22 @@ def test_loc_listlike_dtypes(self):
df.loc[['a', 'x']]

# duplicated categories and codes
index = pd.CategoricalIndex(['a', 'b', 'a'])
index = CategoricalIndex(['a', 'b', 'a'])
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index)

# unique slice
res = df.loc[['a', 'b']]
exp = DataFrame({'A': [1, 3, 2],
'B': [4, 6, 5]},
index=pd.CategoricalIndex(['a', 'a', 'b']))
index=CategoricalIndex(['a', 'a', 'b']))
tm.assert_frame_equal(res, exp, check_index_type=True)

# duplicated slice
res = df.loc[['a', 'a', 'b']]
exp = DataFrame(
{'A': [1, 3, 1, 3, 2],
'B': [4, 6, 4, 6, 5
]}, index=pd.CategoricalIndex(['a', 'a', 'a', 'a', 'b']))
]}, index=CategoricalIndex(['a', 'a', 'a', 'a', 'b']))
tm.assert_frame_equal(res, exp, check_index_type=True)

with tm.assertRaisesRegexp(
Expand All @@ -155,27 +167,27 @@ def test_loc_listlike_dtypes(self):
df.loc[['a', 'x']]

# contains unused category
index = pd.CategoricalIndex(
index = CategoricalIndex(
['a', 'b', 'a', 'c'], categories=list('abcde'))
df = DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=index)

res = df.loc[['a', 'b']]
exp = DataFrame({'A': [1, 3, 2],
'B': [5, 7, 6]}, index=pd.CategoricalIndex(
['a', 'a', 'b'], categories=list('abcde')))
exp = DataFrame({'A': [1, 3, 2], 'B': [5, 7, 6]},
index=CategoricalIndex(['a', 'a', 'b'],
categories=list('abcde')))
tm.assert_frame_equal(res, exp, check_index_type=True)

res = df.loc[['a', 'e']]
exp = DataFrame({'A': [1, 3, np.nan], 'B': [5, 7, np.nan]},
index=pd.CategoricalIndex(['a', 'a', 'e'],
categories=list('abcde')))
index=CategoricalIndex(['a', 'a', 'e'],
categories=list('abcde')))
tm.assert_frame_equal(res, exp, check_index_type=True)

# duplicated slice
res = df.loc[['a', 'a', 'b']]
exp = DataFrame({'A': [1, 3, 1, 3, 2], 'B': [5, 7, 5, 7, 6]},
index=pd.CategoricalIndex(['a', 'a', 'a', 'a', 'b'],
categories=list('abcde')))
index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'],
categories=list('abcde')))
tm.assert_frame_equal(res, exp, check_index_type=True)

with tm.assertRaisesRegexp(
Expand All @@ -184,54 +196,77 @@ def test_loc_listlike_dtypes(self):
'that are in the categories'):
df.loc[['a', 'x']]

def test_get_indexer_array(self):
    """``get_indexer`` accepts an object ndarray of Timestamps."""
    cats = [Timestamp('1999-12-31 00:00:00'),
            Timestamp('2000-12-31 00:00:00')]
    # the same two timestamps, supplied as an object-dtype array
    arr = np.array([Timestamp('1999-12-31 00:00:00'),
                    Timestamp('2000-12-31 00:00:00')], dtype=object)

    ci = CategoricalIndex(cats, categories=cats,
                          ordered=False, dtype='category')

    expected = np.array([0, 1], dtype='intp')
    tm.assert_numpy_array_equal(ci.get_indexer(arr), expected)

def test_getitem_with_listlike(self):
    """Select every dummy column via a plain list of the column labels."""
    # GH 16115
    cats = Categorical([Timestamp('12-31-1999'),
                        Timestamp('12-31-2000')])

    expected = DataFrame([[1, 0], [0, 1]], dtype='uint8',
                         index=[0, 1], columns=cats)

    dummies = pd.get_dummies(cats)
    # index the frame with a list built from its own columns
    column_labels = list(dummies.columns)
    result = dummies[column_labels]
    assert_frame_equal(result, expected)

def test_ix_categorical_index(self):
# GH 12531
df = pd.DataFrame(np.random.randn(3, 3),
index=list('ABC'), columns=list('XYZ'))
df = DataFrame(np.random.randn(3, 3),
index=list('ABC'), columns=list('XYZ'))
cdf = df.copy()
cdf.index = pd.CategoricalIndex(df.index)
cdf.columns = pd.CategoricalIndex(df.columns)
cdf.index = CategoricalIndex(df.index)
cdf.columns = CategoricalIndex(df.columns)

expect = pd.Series(df.loc['A', :], index=cdf.columns, name='A')
expect = Series(df.loc['A', :], index=cdf.columns, name='A')
assert_series_equal(cdf.loc['A', :], expect)

expect = pd.Series(df.loc[:, 'X'], index=cdf.index, name='X')
expect = Series(df.loc[:, 'X'], index=cdf.index, name='X')
assert_series_equal(cdf.loc[:, 'X'], expect)

exp_index = pd.CategoricalIndex(list('AB'), categories=['A', 'B', 'C'])
expect = pd.DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns,
index=exp_index)
exp_index = CategoricalIndex(list('AB'), categories=['A', 'B', 'C'])
expect = DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns,
index=exp_index)
assert_frame_equal(cdf.loc[['A', 'B'], :], expect)

exp_columns = pd.CategoricalIndex(list('XY'),
categories=['X', 'Y', 'Z'])
expect = pd.DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index,
columns=exp_columns)
exp_columns = CategoricalIndex(list('XY'),
categories=['X', 'Y', 'Z'])
expect = DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index,
columns=exp_columns)
assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect)

# non-unique
df = pd.DataFrame(np.random.randn(3, 3),
index=list('ABA'), columns=list('XYX'))
df = DataFrame(np.random.randn(3, 3),
index=list('ABA'), columns=list('XYX'))
cdf = df.copy()
cdf.index = pd.CategoricalIndex(df.index)
cdf.columns = pd.CategoricalIndex(df.columns)
cdf.index = CategoricalIndex(df.index)
cdf.columns = CategoricalIndex(df.columns)

exp_index = pd.CategoricalIndex(list('AA'), categories=['A', 'B'])
expect = pd.DataFrame(df.loc['A', :], columns=cdf.columns,
index=exp_index)
exp_index = CategoricalIndex(list('AA'), categories=['A', 'B'])
expect = DataFrame(df.loc['A', :], columns=cdf.columns,
index=exp_index)
assert_frame_equal(cdf.loc['A', :], expect)

exp_columns = pd.CategoricalIndex(list('XX'), categories=['X', 'Y'])
expect = pd.DataFrame(df.loc[:, 'X'], index=cdf.index,
columns=exp_columns)
exp_columns = CategoricalIndex(list('XX'), categories=['X', 'Y'])
expect = DataFrame(df.loc[:, 'X'], index=cdf.index,
columns=exp_columns)
assert_frame_equal(cdf.loc[:, 'X'], expect)

expect = pd.DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns,
index=pd.CategoricalIndex(list('AAB')))
expect = DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns,
index=CategoricalIndex(list('AAB')))
assert_frame_equal(cdf.loc[['A', 'B'], :], expect)

expect = pd.DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index,
columns=pd.CategoricalIndex(list('XXY')))
expect = DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index,
columns=CategoricalIndex(list('XXY')))
assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect)

def test_read_only_source(self):
Expand Down Expand Up @@ -281,13 +316,13 @@ def test_reindexing(self):
# then return a Categorical
cats = list('cabe')

result = self.df2.reindex(pd.Categorical(['a', 'd'], categories=cats))
result = self.df2.reindex(Categorical(['a', 'd'], categories=cats))
expected = DataFrame({'A': [0, 1, 5, np.nan],
'B': Series(list('aaad')).astype(
'category', categories=cats)}).set_index('B')
assert_frame_equal(result, expected, check_index_type=True)

result = self.df2.reindex(pd.Categorical(['a'], categories=cats))
result = self.df2.reindex(Categorical(['a'], categories=cats))
expected = DataFrame({'A': [0, 1, 5],
'B': Series(list('aaa')).astype(
'category', categories=cats)}).set_index('B')
Expand All @@ -309,15 +344,15 @@ def test_reindexing(self):
assert_frame_equal(result, expected, check_index_type=True)

# give back the type of categorical that we received
result = self.df2.reindex(pd.Categorical(
result = self.df2.reindex(Categorical(
['a', 'd'], categories=cats, ordered=True))
expected = DataFrame(
{'A': [0, 1, 5, np.nan],
'B': Series(list('aaad')).astype('category', categories=cats,
ordered=True)}).set_index('B')
assert_frame_equal(result, expected, check_index_type=True)

result = self.df2.reindex(pd.Categorical(
result = self.df2.reindex(Categorical(
['a', 'd'], categories=['a', 'd']))
expected = DataFrame({'A': [0, 1, 5, np.nan],
'B': Series(list('aaad')).astype(
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/reshape/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,8 +490,8 @@ def test_dataframe_dummies_with_categorical(self):
'cat_x', 'cat_y']]
assert_frame_equal(result, expected)

# GH12402 Add a new parameter `drop_first` to avoid collinearity
def test_basic_drop_first(self):
# GH12402 Add a new parameter `drop_first` to avoid collinearity
# Basic case
s_list = list('abc')
s_series = Series(s_list)
Expand Down

0 comments on commit 331db44

Please sign in to comment.