Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: Add dtype parameter to Categorical.from_codes #24398

Merged
merged 10 commits into from Jan 8, 2019

Revert "deprecate categories and ordered parameters"

This reverts commit 6cf8203.
  • Loading branch information...
topper-123 committed Jan 8, 2019
commit fccb54df64ec95af04c680e61127d95c7fcb2304
@@ -1283,7 +1283,6 @@ Deprecations
- :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`)
- The signature of :meth:`Series.to_csv` has been made uniform with that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, and the ``header`` argument now defaults to ``True``. (:issue:`19715`)
- :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`)
- :meth:`Categorical.from_codes` has deprecated parameters ``categories`` and ``ordered``. Supply a :class:`~pandas.api.types.CategoricalDtype` to new parameter ``dtype`` instead. (:issue:`24398`)
- :func:`pandas.read_table` is deprecated. Instead, use :func:`read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`)
- :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain
many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`)
@@ -605,9 +605,9 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
@classmethod
def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
This conversation was marked as resolved by jreback

This comment has been minimized.

Copy link
@jreback

jreback Dec 23, 2018

Contributor

I would rather deprecate categories & ordered here in favor of dtype

This comment has been minimized.

Copy link
@topper-123

topper-123 Dec 23, 2018

Author Contributor

Yeah, ok, I've updated.

"""
Make a Categorical type from codes and CategoricalDtype.
Make a Categorical type from codes and categories arrays.
This constructor is useful if you already have codes and the dtype and
This constructor is useful if you already have codes and categories and
so do not need the (computation intensive) factorization step, which is
usually done on the constructor.
@@ -621,21 +621,19 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
categories or -1 for NaN
categories : index-like, optional
The categories for the categorical. Items need to be unique.
.. deprecated:: 0.24.0
Use ``dtype`` instead.
ordered : bool, optional
Whether or not this categorical is treated as an ordered
categorical. If not given, the resulting categorical will be
unordered.
.. deprecated:: 0.24.0
Use ``dtype`` instead.
dtype : CategoricalDtype
.. versionchanged:: 0.24.0
The default value has been changed to ``None``. Previously
This conversation was marked as resolved by TomAugspurger

This comment has been minimized.

Copy link
@TomAugspurger

TomAugspurger Jan 8, 2019

Contributor

I don't think this is necessary. The default value of "None" ends up being False, right? So there's no real change in behavior.

the default value was ``False``.
dtype : CategoricalDtype, optional
An instance of ``CategoricalDtype`` to use for this categorical.
.. versionadded:: 0.24.0
dtype will be required in the future.
Examples
--------
@@ -644,18 +642,8 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
[a, b, a, b]
Categories (2, object): [a < b]
"""
if dtype is not None:
if categories is not None or ordered is not None:
raise ValueError("Cannot specify `categories` or `ordered` "
"together with `dtype`.")
elif categories is None and dtype is None:
raise ValueError("Must specify `dtype`.")
else:
msg = u("The 'categories' and 'ordered' keyword are deprecated "
"and will be removed in a future version. Please use "
"'dtype' instead.")
warn(msg, FutureWarning, stacklevel=2)
dtype = CategoricalDtype(categories, ordered)
dtype = CategoricalDtype._from_values_or_dtype(codes, categories,
ordered, dtype)

codes = np.asarray(codes) # #21767
if not is_integer_dtype(codes):
@@ -1223,8 +1211,9 @@ def map(self, mapper):
"""
new_categories = self.categories.map(mapper)
try:
new_dtype = CategoricalDtype(new_categories, ordered=self.ordered)
return self.from_codes(self._codes.copy(), dtype=new_dtype)
return self.from_codes(self._codes.copy(),
categories=new_categories,
ordered=self.ordered)
except ValueError:
# NA values are represented in self._codes with -1
# np.take causes NA values to take final element in new_categories
@@ -14,7 +14,6 @@
from pandas.core.dtypes.common import (
ensure_categorical, is_categorical_dtype, is_datetime64_dtype, is_hashable,
is_list_like, is_scalar, is_timedelta64_dtype)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.generic import ABCSeries

import pandas.core.algorithms as algorithms
@@ -293,19 +292,21 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
from pandas.core.groupby.categorical import recode_for_groupby
self.grouper, self.all_grouper = recode_for_groupby(
self.grouper, self.sort, observed)
dtype = CategoricalDtype(self.grouper.categories,
ordered=self.grouper.ordered)
categories = self.grouper.categories

# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
self._labels = self.grouper.codes
if observed:
codes = algorithms.unique1d(self.grouper.codes)
else:
codes = np.arange(len(dtype.categories))
codes = np.arange(len(categories))

self._group_index = CategoricalIndex(
Categorical.from_codes(codes=codes, dtype=dtype))
Categorical.from_codes(
codes=codes,
categories=categories,
ordered=self.grouper.ordered))

# we are done
if isinstance(self.grouper, Grouping):
@@ -394,8 +395,8 @@ def _make_labels(self):

@cache_readonly
def groups(self):
return self.index.groupby(
Categorical(self.labels, self.group_index, fastpath=True))
return self.index.groupby(Categorical.from_codes(self.labels,
self.group_index))


def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
@@ -18,8 +18,7 @@
ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable,
is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar,
pandas_dtype)
from pandas.core.dtypes.dtypes import (
CategoricalDtype, ExtensionDtype, PandasExtensionDtype)
from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype
from pandas.core.dtypes.generic import ABCDataFrame
from pandas.core.dtypes.missing import array_equivalent, isna

@@ -2027,14 +2026,13 @@ def _get_codes_for_sorting(self):
"""
from pandas.core.arrays import Categorical

def as_dtype(level_codes):
cats = np.arange(np.array(level_codes).max() + 1 if
def cats(level_codes):
return np.arange(np.array(level_codes).max() + 1 if
len(level_codes) else 0,
dtype=level_codes.dtype)
return CategoricalDtype(cats, ordered=True)

return [Categorical.from_codes(level_codes,
dtype=as_dtype(level_codes))
return [Categorical.from_codes(level_codes, cats(level_codes),
ordered=True)
for level_codes in self.codes]

def sortlevel(self, level=0, ascending=True, sort_remaining=True):
@@ -55,7 +55,6 @@
from pandas.core.dtypes.common import (
is_categorical_dtype, is_datetime64tz_dtype, is_object_dtype,
needs_i8_conversion, pandas_dtype)
from pandas.core.dtypes.dtypes import CategoricalDtype as CDT

from pandas import ( # noqa:F401
Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index,
@@ -622,8 +621,9 @@ def decode(obj):
name=obj[u'name'])
elif typ == u'category':
from_codes = globals()[obj[u'klass']].from_codes
dtype = CDT(obj[u'categories'], ordered=obj[u'ordered'])
return from_codes(codes=obj[u'codes'], dtype=dtype)
return from_codes(codes=obj[u'codes'],
categories=obj[u'categories'],
ordered=obj[u'ordered'])

elif typ == u'interval':
return Interval(obj[u'left'], obj[u'right'], obj[u'closed'])
@@ -24,7 +24,6 @@
ensure_int64, ensure_object, ensure_platform_int, is_categorical_dtype,
is_datetime64_dtype, is_datetime64tz_dtype, is_list_like,
is_timedelta64_dtype)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import array_equivalent

from pandas import (
@@ -2207,8 +2206,10 @@ def convert(self, values, nan_rep, encoding, errors):
categories = categories[~mask]
codes[codes != -1] -= mask.astype(int).cumsum().values

dtype = CategoricalDtype(categories, ordered=self.ordered)
self.data = Categorical.from_codes(codes, dtype=dtype)
self.data = Categorical.from_codes(codes,
categories=categories,
ordered=self.ordered)

else:

try:
@@ -21,13 +21,18 @@ class TestCategoricalConstructors(object):
def test_validate_ordered(self):
# see gh-14058
exp_msg = "'ordered' must either be 'True' or 'False'"
exp_err = TypeError

# This should be a boolean or None.
# This should be a boolean.
ordered = np.array([0, 1, 2])

with pytest.raises(TypeError, match=exp_msg):
with pytest.raises(exp_err, match=exp_msg):
Categorical([1, 2, 3], ordered=ordered)

with pytest.raises(exp_err, match=exp_msg):
Categorical.from_codes([0, 0, 1], categories=['a', 'b', 'c'],
ordered=ordered)

def test_constructor_empty(self):
# GH 17248
c = Categorical([])
@@ -416,41 +421,76 @@ def test_constructor_with_categorical_categories(self):
tm.assert_categorical_equal(result, expected)

def test_from_codes(self):
dtype = CategoricalDtype(categories=[1, 2])

# no dtype or categories
msg = 'Must specify `dtype`.'
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2])

# too few categories
dtype = CategoricalDtype(categories=[1, 2])
msg = "codes need to be between "
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], dtype=dtype)

# no int codes
msg = "codes need to be array-like integers"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], dtype=dtype)

# no unique categories
with pytest.raises(ValueError,
match="Categorical categories must be unique"):
Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])

# NaN categories included
with pytest.raises(ValueError,
match="Categorial categories cannot be null"):
Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])

# too negative
dtype = CategoricalDtype(categories=["a", "b", "c"])
msg = r"codes need to be between -1 and len\(categories\)-1"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], dtype=dtype)

exp = Categorical(["a", "b", "c"], ordered=False)
res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
tm.assert_categorical_equal(exp, res)

res = Categorical.from_codes([0, 1, 2], dtype=dtype)
tm.assert_categorical_equal(exp, res)

codes = np.random.choice([0, 1], 5, p=[0.9, 0.1])
dtype = CategoricalDtype(categories=["train", "test"])
Categorical.from_codes(codes, categories=dtype.categories)
Categorical.from_codes(codes, dtype=dtype)

This comment has been minimized.

Copy link
@jschendel

jschendel Jan 7, 2019

Member

I don't follow what this last example is supposed to be testing. Is there a missing assert of some kind? Or is this just not supposed to raise?

This comment has been minimized.

Copy link
@TomAugspurger

TomAugspurger Jan 8, 2019

Contributor

Yeah, this test is duplicative with earlier ones (even on master). I'd be OK with removing it.


def test_from_codes_with_categorical_categories(self):
# GH17884
expected = Categorical(['a', 'b'], categories=['a', 'b', 'c'])

result = Categorical.from_codes(
[0, 1], categories=Categorical(['a', 'b', 'c']))
tm.assert_categorical_equal(result, expected)

result = Categorical.from_codes(
[0, 1], categories=CategoricalIndex(['a', 'b', 'c']))
tm.assert_categorical_equal(result, expected)

# non-unique Categorical still raises
with pytest.raises(ValueError,
match="Categorical categories must be unique"):
Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a']))

def test_from_codes_with_nan_code(self):
# GH21767
codes = [1, 2, np.nan]
dtype = CategoricalDtype(categories=['a', 'b', 'c'])
with pytest.raises(ValueError,
match="codes need to be array-like integers"):
Categorical.from_codes(codes, categories=dtype.categories)
with pytest.raises(ValueError,
match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype=dtype)
@@ -460,43 +500,36 @@ def test_from_codes_with_float(self):
codes = [1.0, 2.0, 0] # integer, but in float dtype
dtype = CategoricalDtype(categories=['a', 'b', 'c'])

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
with tm.assert_produces_warning(FutureWarning):
cat = Categorical.from_codes(codes, dtype.categories)
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))

with tm.assert_produces_warning(FutureWarning):
cat = Categorical.from_codes(codes, dtype=dtype)
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))

codes = [1.1, 2.0, 0] # non-integer
with pytest.raises(ValueError,
match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype.categories)
with pytest.raises(ValueError,
match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype=dtype)

def test_from_codes_deprecated(self, ordered):
# GH24398
cats = ['a', 'b']
with tm.assert_produces_warning(FutureWarning):
Categorical.from_codes([0, 1], categories=cats)

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
Categorical.from_codes([0, 1], categories=cats, ordered=True)

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
Categorical.from_codes([0, 1], categories=cats, ordered=False)

@pytest.mark.parametrize('dtype', [None, 'category'])
def test_from_inferred_categories(self, dtype):
cats = ['a', 'b']
codes = np.array([0, 0, 1, 1], dtype='i8')
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical.from_codes(codes,
dtype=CategoricalDtype(cats))
expected = Categorical.from_codes(codes, cats)
tm.assert_categorical_equal(result, expected)

@pytest.mark.parametrize('dtype', [None, 'category'])
def test_from_inferred_categories_sorts(self, dtype):
cats = ['b', 'a']
codes = np.array([0, 1, 1, 1], dtype='i8')
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical.from_codes([1, 0, 0, 0],
dtype=CategoricalDtype(['a', 'b']))
expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b'])
tm.assert_categorical_equal(result, expected)

def test_from_inferred_categories_dtype(self):
@@ -1,29 +1,25 @@
# -*- coding: utf-8 -*-

from pandas import Categorical
from pandas.api.types import CategoricalDtype
import pandas.util.testing as tm


class TestCategoricalSubclassing(object):

def test_constructor(self):
subclassed = tm.SubclassedCategorical(['a', 'b', 'c'])
assert isinstance(subclassed, tm.SubclassedCategorical)
tm.assert_categorical_equal(subclassed, Categorical(['a', 'b', 'c']))
sc = tm.SubclassedCategorical(['a', 'b', 'c'])
assert isinstance(sc, tm.SubclassedCategorical)
tm.assert_categorical_equal(sc, Categorical(['a', 'b', 'c']))

def test_from_codes(self):
dtype = CategoricalDtype(['a', 'b', 'c'])
subclassed = tm.SubclassedCategorical.from_codes([1, 0, 2],
dtype=dtype)
assert isinstance(subclassed, tm.SubclassedCategorical)

expected = Categorical.from_codes([1, 0, 2], dtype=dtype)
tm.assert_categorical_equal(subclassed, expected)
sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ['a', 'b', 'c'])
assert isinstance(sc, tm.SubclassedCategorical)
exp = Categorical.from_codes([1, 0, 2], ['a', 'b', 'c'])
tm.assert_categorical_equal(sc, exp)

def test_map(self):
subclassed = tm.SubclassedCategorical(['a', 'b', 'c'])
result = subclassed.map(lambda x: x.upper())
assert isinstance(result, tm.SubclassedCategorical)
expected = Categorical(['A', 'B', 'C'])
tm.assert_categorical_equal(result, expected)
sc = tm.SubclassedCategorical(['a', 'b', 'c'])
res = sc.map(lambda x: x.upper())
assert isinstance(res, tm.SubclassedCategorical)
exp = Categorical(['A', 'B', 'C'])
tm.assert_categorical_equal(res, exp)
ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.