pandas-dev · jorisvandenbossche · Apr 30, 2018 · Apr 27, 2018 · Apr 27, 2018 · Apr 27, 2018
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -856,6 +856,7 @@ Other API Changes
 - Constructing a Series from a list of length 1 no longer broadcasts this list when a longer index is specified (:issue:`19714`, :issue:`20391`).
 - :func:`DataFrame.to_dict` with ``orient='index'`` no longer casts int columns to float for a DataFrame with only int and float columns (:issue:`18580`)
 - A user-defined-function that is passed to :func:`Series.rolling().aggregate() <pandas.core.window.Rolling.aggregate>`, :func:`DataFrame.rolling().aggregate() <pandas.core.window.Rolling.aggregate>`, or its expanding cousins, will now *always* be passed a ``Series``, rather than a ``np.array``; ``.apply()`` only has the ``raw`` keyword, see :ref:`here <whatsnew_0230.enhancements.window_raw>`. This is consistent with the signatures of ``.aggregate()`` across pandas (:issue:`20584`)
+- The previous default behavior of negative indices in ``Categorical.take`` is deprecated. In a future version it will change from meaning missing values to meaning positional indices from the right. The future behavior is consistent with :meth:`Series.take` (:issue:`20664`).
 
 .. _whatsnew_0230.deprecations:
 
@@ -1024,6 +1025,7 @@ Categorical
 - Bug in ``Categorical.__iter__`` not converting to Python types (:issue:`19909`)
 - Bug in :func:`pandas.factorize` returning the unique codes for the ``uniques``. This now returns a ``Categorical`` with the same dtype as the input (:issue:`19721`)
 - Bug in :func:`pandas.factorize` including an item for missing values in the ``uniques`` return value (:issue:`19721`)
+- Bug in :meth:`Series.take` with categorical data interpreting ``-1`` in `indicies` as missing value markers, rather than the last element of the Series (:issue:`20664`)
 
 Datetimelike
 ^^^^^^^^^^^^

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 from warnings import warn
+import textwrap
 import types
 
 from pandas import compat
@@ -29,7 +30,7 @@
     is_scalar,
     is_dict_like)
 
-from pandas.core.algorithms import factorize, take_1d, unique1d
+from pandas.core.algorithms import factorize, take_1d, unique1d, take
 from pandas.core.accessor import PandasDelegate
 from pandas.core.base import (PandasObject,
                               NoNewAttributesMixin, _shared_docs)
@@ -48,6 +49,17 @@
 from .base import ExtensionArray
 
 
+_take_msg = textwrap.dedent("""\
+    Interpreting negative values in 'indexer' as missing values.
+    In the future, this will change to meaning positional indicies
+    from the right.
+
+    Use 'allow_fill=True' to retain the previous behavior and silence this
+    warning.
+
+    Use 'allow_fill=False' to accept the new behavior.""")
+
+
 def _cat_compare_op(op):
     def f(self, other):
         # On python2, you can usually compare any type to any type, and
@@ -1732,17 +1744,49 @@ def fillna(self, value=None, method=None, limit=None):
         return self._constructor(values, categories=self.categories,
                                  ordered=self.ordered, fastpath=True)
 
-    def take_nd(self, indexer, allow_fill=True, fill_value=None):
-        """ Take the codes by the indexer, fill with the fill_value.
-
-        For internal compatibility with numpy arrays.
+    def take_nd(self, indexer, allow_fill=None, fill_value=None):
         """
+        Take elements from the Categorical.
 
-        # filling must always be None/nan here
-        # but is passed thru internally
-        assert isna(fill_value)
+        Parameters
+        ----------
+        indexer : sequence of integers
+        allow_fill : bool, default None.
+            How to handle negative values in `indexer`.
 
-        codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1)
+            * False: negative values in `indices` indicate positional indices
+              from the right. This is similar to
+              :func:`numpy.take`.
+
+            * True: negative values in `indices` indicate missing values
+              (the default). These values are set to `fill_value`. Any other
+              other negative values raise a ``ValueError``.
+
+            .. versionchanged:: 0.23.0
+
+               Deprecated the default value of `allow_fill`. The deprecated
+               default is ``True``. In the future, this will change to
+               ``False``.
+
+        Returns
+        -------
+        Categorical
+            This Categorical will have the same categories and ordered as
+            `self`.
+        """
+        indexer = np.asarray(indexer, dtype=np.intp)
+        if allow_fill is None:
+            if (indexer < 0).any():
+                warn(_take_msg, FutureWarning, stacklevel=2)
+                allow_fill = True
+
+        if isna(fill_value):
+            # For categorical, any NA value is considered a user-facing
+            # NA value. Our storage NA value is -1.
+            fill_value = -1
+
+        codes = take(self._codes, indexer, allow_fill=allow_fill,
+                     fill_value=fill_value)
         result = self._constructor(codes, categories=self.categories,
                                    ordered=self.ordered, fastpath=True)
         return result

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -3499,7 +3499,14 @@ def _take(self, indices, axis=0, convert=True, is_copy=False):
 
         indices = _ensure_platform_int(indices)
         new_index = self.index.take(indices)
-        new_values = self._values.take(indices)
+
+        if is_categorical_dtype(self):
+            # https://github.com/pandas-dev/pandas/issues/20664
+            # TODO: remove when the default Categorical.take behavior changes
+            kwargs = {'allow_fill': False}
+        else:
+            kwargs = {}
+        new_values = self._values.take(indices, **kwargs)
 
         result = (self._constructor(new_values, index=new_index,
                                     fastpath=True).__finalize__(self))

diff --git a/pandas/tests/categorical/test_algos.py b/pandas/tests/categorical/test_algos.py
@@ -5,6 +5,18 @@
 import pandas.util.testing as tm
 
 
+@pytest.fixture(params=[True, False])
+def allow_fill(request):
+    """Boolean 'allow_fill' parameter for Categorical.take"""
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def ordered(request):
+    """Boolean 'ordered' parameter for Categorical."""
+    return request.param
+
+
 @pytest.mark.parametrize('ordered', [True, False])
 @pytest.mark.parametrize('categories', [
     ['b', 'a', 'c'],
@@ -69,3 +81,45 @@ def test_isin_empty(empty):
 
     result = s.isin(empty)
     tm.assert_numpy_array_equal(expected, result)
+
+
+class TestTake(object):
+    # https://github.com/pandas-dev/pandas/issues/20664
+
+    def test_take_warns(self):
+        cat = pd.Categorical(['a', 'b'])
+        with tm.assert_produces_warning(FutureWarning):
+            cat.take([0, -1])
+
+    def test_take_positive_no_warning(self):
+        cat = pd.Categorical(['a', 'b'])
+        with tm.assert_produces_warning(None):
+            cat.take([0, 0])
+
+    def test_take_bounds(self, allow_fill):
+        # https://github.com/pandas-dev/pandas/issues/20664
+        cat = pd.Categorical(['a', 'b', 'a'])
+        with pytest.raises(IndexError):
+            cat.take([4, 5], allow_fill=allow_fill)
+
+    def test_take_empty(self, allow_fill):
+        # https://github.com/pandas-dev/pandas/issues/20664
+        cat = pd.Categorical([], categories=['a', 'b'])
+        with pytest.raises(IndexError):
+            cat.take([0], allow_fill=allow_fill)
+
+    def test_positional_take(self, ordered):
+        cat = pd.Categorical(['a', 'a', 'b', 'b'], categories=['b', 'a'],
+                             ordered=ordered)
+        result = cat.take([0, 1, 2], allow_fill=False)
+        expected = pd.Categorical(['a', 'a', 'b'], categories=cat.categories,
+                                  ordered=ordered)
+        tm.assert_categorical_equal(result, expected)
+
+    def test_positional_take_unobserved(self, ordered):
+        cat = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'],
+                             ordered=ordered)
+        result = cat.take([1, 0], allow_fill=False)
+        expected = pd.Categorical(['b', 'a'], categories=cat.categories,
+                                  ordered=ordered)
+        tm.assert_categorical_equal(result, expected)
diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py
@@ -111,7 +111,7 @@ def test_take_non_na_fill_value(self):
     def test_take_out_of_bounds_raises(self):
         pass
 
-    @skip_take
+    @pytest.mark.skip(reason="GH-20747. Unobserved categories.")
     def test_take_series(self):
         pass
 

diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
@@ -32,7 +32,7 @@ def data():
     while len(data[0]) == len(data[1]):
         data = make_data()
 
-    return JSONArray(make_data())
+    return JSONArray(data)
 
 
 @pytest.fixture

diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py
@@ -753,6 +753,16 @@ def test_take():
         s.take([-1, 3, 4], convert=False)
 
 
+def test_take_categorical():
+    # https://github.com/pandas-dev/pandas/issues/20664
+    s = Series(pd.Categorical(['a', 'b', 'c']))
+    result = s.take([-2, -2, 0])
+    expected = Series(pd.Categorical(['b', 'b', 'a'],
+                      categories=['a', 'b', 'c']),
+                      index=[1, 1, 0])
+    assert_series_equal(result, expected)
+
+
 def test_head_tail(test_data):
     assert_series_equal(test_data.series.head(), test_data.series[:5])
     assert_series_equal(test_data.series.head(0), test_data.series[0:0])