Deprecate the categories and ordered parameters of Categorical.from_codes

pandas-dev · Jan 6, 2019 · a4cf7a2 · a4cf7a2
1 parent 2e8f46f
commit a4cf7a2
Show file tree

Hide file tree

Showing 16 changed files with 138 additions and 155 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1234,6 +1234,7 @@ Deprecations
 - :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`)
 - The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`)
 - :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`)
+- :meth:`Categorical.from_codes` has deprecated the ``categories`` and ``ordered`` parameters. Pass a :class:`~pandas.api.types.CategoricalDtype` to the new ``dtype`` parameter instead. (:issue:`24398`)
 - :func:`pandas.read_table` is deprecated. Instead, use :func:`read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`)
 - :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain
   many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -605,9 +605,9 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
     @classmethod
     def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
         """
-        Make a Categorical type from codes and categories arrays.
+        Make a Categorical type from codes and a CategoricalDtype.
 
-        This constructor is useful if you already have codes and categories and
+        This constructor is useful if you already have codes and the dtype and
         so do not need the (computation intensive) factorization step, which is
         usually done on the constructor.
 
@@ -621,19 +621,21 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
             categories or -1 for NaN
         categories : index-like, optional
             The categories for the categorical. Items need to be unique.
+
+            .. deprecated:: 0.24.0
+                Use ``dtype`` instead.
         ordered : bool, optional
             Whether or not this categorical is treated as an ordered
             categorical. If not given, the resulting categorical will be
             unordered.
 
-            .. versionchanged:: 0.24.0
-
-                The default value has been changed to  ``None``. Previously
-                the default value was ``False``.
-        dtype : CategoricalDtype, optional
+            .. deprecated:: 0.24.0
+                Use ``dtype`` instead.
+        dtype : CategoricalDtype
             An instance of ``CategoricalDtype`` to use for this categorical.
 
             .. versionadded:: 0.24.0
+                ``dtype`` will become a required argument in a future version.
 
         Examples
         --------
@@ -642,8 +644,18 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
         [a, b, a, b]
         Categories (2, object): [a < b]
         """
-        dtype = CategoricalDtype._from_values_or_dtype(codes, categories,
-                                                       ordered, dtype)
+        if dtype is not None:
+            if categories is not None or ordered is not None:
+                raise ValueError("Cannot specify `categories` or `ordered` "
+                                 "together with `dtype`.")
+        elif categories is None and dtype is None:
+            raise ValueError("Must specify `dtype`.")
+        else:
+            msg = u("The 'categories' and 'ordered' keyword are deprecated "
+                    "and will be removed in a future version. Please use "
+                    "'dtype' instead.")
+            warn(msg, FutureWarning, stacklevel=2)
+            dtype = CategoricalDtype(categories, ordered)
 
         codes = np.asarray(codes)  # #21767
         if not is_integer_dtype(codes):
@@ -1211,9 +1223,8 @@ def map(self, mapper):
         """
         new_categories = self.categories.map(mapper)
         try:
-            return self.from_codes(self._codes.copy(),
-                                   categories=new_categories,
-                                   ordered=self.ordered)
+            new_dtype = CategoricalDtype(new_categories, ordered=self.ordered)
+            return self.from_codes(self._codes.copy(), dtype=new_dtype)
         except ValueError:
             # NA values are represented in self._codes with -1
             # np.take causes NA values to take final element in new_categories

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -14,6 +14,7 @@
 from pandas.core.dtypes.common import (
     ensure_categorical, is_categorical_dtype, is_datetime64_dtype, is_hashable,
     is_list_like, is_scalar, is_timedelta64_dtype)
+from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.generic import ABCSeries
 
 import pandas.core.algorithms as algorithms
@@ -292,21 +293,19 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
                 from pandas.core.groupby.categorical import recode_for_groupby
                 self.grouper, self.all_grouper = recode_for_groupby(
                     self.grouper, self.sort, observed)
-                categories = self.grouper.categories
+                dtype = CategoricalDtype(self.grouper.categories,
+                                         ordered=self.grouper.ordered)
 
                 # we make a CategoricalIndex out of the cat grouper
                 # preserving the categories / ordered attributes
                 self._labels = self.grouper.codes
                 if observed:
                     codes = algorithms.unique1d(self.grouper.codes)
                 else:
-                    codes = np.arange(len(categories))
+                    codes = np.arange(len(dtype.categories))
 
                 self._group_index = CategoricalIndex(
-                    Categorical.from_codes(
-                        codes=codes,
-                        categories=categories,
-                        ordered=self.grouper.ordered))
+                    Categorical.from_codes(codes=codes, dtype=dtype))
 
             # we are done
             if isinstance(self.grouper, Grouping):
@@ -395,8 +394,8 @@ def _make_labels(self):
 
     @cache_readonly
     def groups(self):
-        return self.index.groupby(Categorical.from_codes(self.labels,
-                                                         self.group_index))
+        return self.index.groupby(
+            Categorical(self.labels, self.group_index, fastpath=True))
 
 
 def _get_grouper(obj, key=None, axis=0, level=None, sort=True,

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -18,7 +18,8 @@
     ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable,
     is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar,
     pandas_dtype)
-from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype
+from pandas.core.dtypes.dtypes import (
+    CategoricalDtype, ExtensionDtype, PandasExtensionDtype)
 from pandas.core.dtypes.generic import ABCDataFrame
 from pandas.core.dtypes.missing import array_equivalent, isna
 
@@ -2026,13 +2027,14 @@ def _get_codes_for_sorting(self):
         """
         from pandas.core.arrays import Categorical
 
-        def cats(level_codes):
-            return np.arange(np.array(level_codes).max() + 1 if
+        def as_dtype(level_codes):
+            cats = np.arange(np.array(level_codes).max() + 1 if
                              len(level_codes) else 0,
                              dtype=level_codes.dtype)
+            return CategoricalDtype(cats, ordered=True)
 
-        return [Categorical.from_codes(level_codes, cats(level_codes),
-                                       ordered=True)
+        return [Categorical.from_codes(level_codes,
+                                       dtype=as_dtype(level_codes))
                 for level_codes in self.codes]
 
     def sortlevel(self, level=0, ascending=True, sort_remaining=True):

diff --git a/pandas/io/packers.py b/pandas/io/packers.py
@@ -55,6 +55,7 @@
 from pandas.core.dtypes.common import (
     is_categorical_dtype, is_datetime64tz_dtype, is_object_dtype,
     needs_i8_conversion, pandas_dtype)
+from pandas.core.dtypes.dtypes import CategoricalDtype as CDT
 
 from pandas import (  # noqa:F401
     Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index,
@@ -621,9 +622,8 @@ def decode(obj):
                                                     name=obj[u'name'])
     elif typ == u'category':
         from_codes = globals()[obj[u'klass']].from_codes
-        return from_codes(codes=obj[u'codes'],
-                          categories=obj[u'categories'],
-                          ordered=obj[u'ordered'])
+        dtype = CDT(obj[u'categories'], ordered=obj[u'ordered'])
+        return from_codes(codes=obj[u'codes'], dtype=dtype)
 
     elif typ == u'interval':
         return Interval(obj[u'left'], obj[u'right'], obj[u'closed'])

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -24,6 +24,7 @@
     ensure_int64, ensure_object, ensure_platform_int, is_categorical_dtype,
     is_datetime64_dtype, is_datetime64tz_dtype, is_list_like,
     is_timedelta64_dtype)
+from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import array_equivalent
 
 from pandas import (
@@ -2206,10 +2207,8 @@ def convert(self, values, nan_rep, encoding, errors):
                         categories = categories[~mask]
                         codes[codes != -1] -= mask.astype(int).cumsum().values
 
-                self.data = Categorical.from_codes(codes,
-                                                   categories=categories,
-                                                   ordered=self.ordered)
-
+                dtype = CategoricalDtype(categories, ordered=self.ordered)
+                self.data = Categorical.from_codes(codes, dtype=dtype)
             else:
 
                 try:

diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -21,18 +21,13 @@ class TestCategoricalConstructors(object):
     def test_validate_ordered(self):
         # see gh-14058
         exp_msg = "'ordered' must either be 'True' or 'False'"
-        exp_err = TypeError
 
-        # This should be a boolean.
+        # This should be a boolean or None.
         ordered = np.array([0, 1, 2])
 
-        with pytest.raises(exp_err, match=exp_msg):
+        with pytest.raises(TypeError, match=exp_msg):
             Categorical([1, 2, 3], ordered=ordered)
 
-        with pytest.raises(exp_err, match=exp_msg):
-            Categorical.from_codes([0, 0, 1], categories=['a', 'b', 'c'],
-                                   ordered=ordered)
-
     def test_constructor_empty(self):
         # GH 17248
         c = Categorical([])
@@ -421,76 +416,41 @@ def test_constructor_with_categorical_categories(self):
         tm.assert_categorical_equal(result, expected)
 
     def test_from_codes(self):
+        dtype = CategoricalDtype(categories=[1, 2])
+
+        # no dtype or categories
+        msg = 'Must specify `dtype`.'
+        with pytest.raises(ValueError, match=msg):
+            Categorical.from_codes([1, 2])
 
         # too few categories
-        dtype = CategoricalDtype(categories=[1, 2])
         msg = "codes need to be between "
-        with pytest.raises(ValueError, match=msg):
-            Categorical.from_codes([1, 2], categories=dtype.categories)
         with pytest.raises(ValueError, match=msg):
             Categorical.from_codes([1, 2], dtype=dtype)
 
         # no int codes
         msg = "codes need to be array-like integers"
-        with pytest.raises(ValueError, match=msg):
-            Categorical.from_codes(["a"], categories=dtype.categories)
         with pytest.raises(ValueError, match=msg):
             Categorical.from_codes(["a"], dtype=dtype)
 
-        # no unique categories
-        with pytest.raises(ValueError,
-                           match="Categorical categories must be unique"):
-            Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])
-
-        # NaN categories included
-        with pytest.raises(ValueError,
-                           match="Categorial categories cannot be null"):
-            Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])
-
         # too negative
         dtype = CategoricalDtype(categories=["a", "b", "c"])
         msg = r"codes need to be between -1 and len\(categories\)-1"
-        with pytest.raises(ValueError, match=msg):
-            Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
         with pytest.raises(ValueError, match=msg):
             Categorical.from_codes([-2, 1, 2], dtype=dtype)
 
         exp = Categorical(["a", "b", "c"], ordered=False)
-        res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
-        tm.assert_categorical_equal(exp, res)
-
         res = Categorical.from_codes([0, 1, 2], dtype=dtype)
         tm.assert_categorical_equal(exp, res)
 
         codes = np.random.choice([0, 1], 5, p=[0.9, 0.1])
         dtype = CategoricalDtype(categories=["train", "test"])
-        Categorical.from_codes(codes, categories=dtype.categories)
         Categorical.from_codes(codes, dtype=dtype)
 
-    def test_from_codes_with_categorical_categories(self):
-        # GH17884
-        expected = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
-
-        result = Categorical.from_codes(
-            [0, 1], categories=Categorical(['a', 'b', 'c']))
-        tm.assert_categorical_equal(result, expected)
-
-        result = Categorical.from_codes(
-            [0, 1], categories=CategoricalIndex(['a', 'b', 'c']))
-        tm.assert_categorical_equal(result, expected)
-
-        # non-unique Categorical still raises
-        with pytest.raises(ValueError,
-                           match="Categorical categories must be unique"):
-            Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a']))
-
     def test_from_codes_with_nan_code(self):
         # GH21767
         codes = [1, 2, np.nan]
         dtype = CategoricalDtype(categories=['a', 'b', 'c'])
-        with pytest.raises(ValueError,
-                           match="codes need to be array-like integers"):
-            Categorical.from_codes(codes, categories=dtype.categories)
         with pytest.raises(ValueError,
                            match="codes need to be array-like integers"):
             Categorical.from_codes(codes, dtype=dtype)
@@ -500,36 +460,43 @@ def test_from_codes_with_float(self):
         codes = [1.0, 2.0, 0]  # integer, but in float dtype
         dtype = CategoricalDtype(categories=['a', 'b', 'c'])
 
-        with tm.assert_produces_warning(FutureWarning):
-            cat = Categorical.from_codes(codes, dtype.categories)
-        tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))
-
-        with tm.assert_produces_warning(FutureWarning):
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
             cat = Categorical.from_codes(codes, dtype=dtype)
         tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))
 
         codes = [1.1, 2.0, 0]  # non-integer
-        with pytest.raises(ValueError,
-                           match="codes need to be array-like integers"):
-            Categorical.from_codes(codes, dtype.categories)
         with pytest.raises(ValueError,
                            match="codes need to be array-like integers"):
             Categorical.from_codes(codes, dtype=dtype)
 
+    def test_from_codes_deprecated(self, ordered):
+        # GH24398
+        cats = ['a', 'b']
+        with tm.assert_produces_warning(FutureWarning):
+            Categorical.from_codes([0, 1], categories=cats)
+
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            Categorical.from_codes([0, 1], categories=cats, ordered=True)
+
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            Categorical.from_codes([0, 1], categories=cats, ordered=False)
+
     @pytest.mark.parametrize('dtype', [None, 'category'])
     def test_from_inferred_categories(self, dtype):
         cats = ['a', 'b']
         codes = np.array([0, 0, 1, 1], dtype='i8')
         result = Categorical._from_inferred_categories(cats, codes, dtype)
-        expected = Categorical.from_codes(codes, cats)
+        expected = Categorical.from_codes(codes,
+                                          dtype=CategoricalDtype(cats))
         tm.assert_categorical_equal(result, expected)
 
     @pytest.mark.parametrize('dtype', [None, 'category'])
     def test_from_inferred_categories_sorts(self, dtype):
         cats = ['b', 'a']
         codes = np.array([0, 1, 1, 1], dtype='i8')
         result = Categorical._from_inferred_categories(cats, codes, dtype)
-        expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b'])
+        expected = Categorical.from_codes([1, 0, 0, 0],
+                                          dtype=CategoricalDtype(['a', 'b']))
         tm.assert_categorical_equal(result, expected)
 
     def test_from_inferred_categories_dtype(self):

diff --git a/pandas/tests/arrays/categorical/test_subclass.py b/pandas/tests/arrays/categorical/test_subclass.py
@@ -1,25 +1,29 @@
 # -*- coding: utf-8 -*-
 
 from pandas import Categorical
+from pandas.api.types import CategoricalDtype
 import pandas.util.testing as tm
 
 
 class TestCategoricalSubclassing(object):
 
     def test_constructor(self):
-        sc = tm.SubclassedCategorical(['a', 'b', 'c'])
-        assert isinstance(sc, tm.SubclassedCategorical)
-        tm.assert_categorical_equal(sc, Categorical(['a', 'b', 'c']))
+        subclassed = tm.SubclassedCategorical(['a', 'b', 'c'])
+        assert isinstance(subclassed, tm.SubclassedCategorical)
+        tm.assert_categorical_equal(subclassed, Categorical(['a', 'b', 'c']))
 
     def test_from_codes(self):
-        sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ['a', 'b', 'c'])
-        assert isinstance(sc, tm.SubclassedCategorical)
-        exp = Categorical.from_codes([1, 0, 2], ['a', 'b', 'c'])
-        tm.assert_categorical_equal(sc, exp)
+        dtype = CategoricalDtype(['a', 'b', 'c'])
+        subclassed = tm.SubclassedCategorical.from_codes([1, 0, 2],
+                                                         dtype=dtype)
+        assert isinstance(subclassed, tm.SubclassedCategorical)
+
+        expected = Categorical.from_codes([1, 0, 2], dtype=dtype)
+        tm.assert_categorical_equal(subclassed, expected)
 
     def test_map(self):
-        sc = tm.SubclassedCategorical(['a', 'b', 'c'])
-        res = sc.map(lambda x: x.upper())
-        assert isinstance(res, tm.SubclassedCategorical)
-        exp = Categorical(['A', 'B', 'C'])
-        tm.assert_categorical_equal(res, exp)
+        subclassed = tm.SubclassedCategorical(['a', 'b', 'c'])
+        result = subclassed.map(lambda x: x.upper())
+        assert isinstance(result, tm.SubclassedCategorical)
+        expected = Categorical(['A', 'B', 'C'])
+        tm.assert_categorical_equal(result, expected)