Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: Add dtype parameter to Categorical.from_codes #24398

Merged
merged 10 commits into from Jan 8, 2019
@@ -403,6 +403,7 @@ Other Enhancements
- :meth:`pandas.api.types.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``,
all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`)
- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`).
- :meth:`Categorical.from_codes` can now take a ``dtype`` parameter as an alternative to passing ``categories`` and ``ordered`` (:issue:`24398`).
- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
- Compatibility with Matplotlib 3.0 (:issue:`22790`).
- Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
@@ -603,13 +603,13 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
return cls(codes, dtype=dtype, fastpath=True)

@classmethod
def from_codes(cls, codes, categories, ordered=False):
def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
This conversation was marked as resolved by jreback

This comment has been minimized.

Copy link
@jreback

jreback Dec 23, 2018

Contributor

I would rather deprecate categories & ordered here in favor of dtype

This comment has been minimized.

Copy link
@topper-123

topper-123 Dec 23, 2018

Author Contributor

Yeah, ok, I've updated.

"""
Make a Categorical type from codes and categories arrays.
Make a Categorical type from codes and categories or dtype.
This constructor is useful if you already have codes and categories and
so do not need the (computation intensive) factorization step, which is
usually done on the constructor.
This constructor is useful if you already have codes and
categories/dtype and so do not need the (computation intensive)
factorization step, which is usually done on the constructor.
If your data does not follow this convention, please use the normal
constructor.
@@ -618,16 +618,38 @@ def from_codes(cls, codes, categories, ordered=False):
----------
codes : array-like, integers
An integer array, where each integer points to a category in
categories or -1 for NaN
categories : index-like
categories or dtype.categories, or else is -1 for NaN
This conversation was marked as resolved by TomAugspurger

This comment has been minimized.

Copy link
@TomAugspurger

TomAugspurger Jan 8, 2019

Contributor
Suggested change
categories or dtype.categories, or else is -1 for NaN
`categories` or ``dtype.categories``, or else is -1 for NaN
categories : index-like, optional
The categories for the categorical. Items need to be unique.
ordered : boolean, (default False)
Whether or not this categorical is treated as a ordered
categorical. If not given, the resulting categorical will be
unordered.
"""
dtype = CategoricalDtype._from_values_or_dtype(codes, categories,
ordered)
If the categories are not given here, then they must be provided
in `dtype`.
ordered : bool, optional
Whether or not this categorical is treated as an ordered
categorical. If not given here or in `dtype`, the resulting
categorical will be unordered.
dtype : CategoricalDtype or the string "category", optional
If :class:`CategoricalDtype`, cannot be used together with
`categories` or `ordered`.
.. versionadded:: 0.24.0
When `dtype` is provided, neither `categories` nor `ordered`
should be provided.
Examples
--------
>>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
>>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
[a, b, a, b]
Categories (2, object): [a < b]
"""
dtype = CategoricalDtype._from_values_or_dtype(categories=categories,
ordered=ordered,
dtype=dtype)
if dtype.categories is None:
msg = ("The categories must be provided in 'categories' or "
"'dtype'. Both were None.")
raise ValueError(msg)

codes = np.asarray(codes) # #21767
if not is_integer_dtype(codes):
@@ -642,12 +664,6 @@ def from_codes(cls, codes, categories, ordered=False):
if msg:
raise ValueError(msg)

try:
codes = coerce_indexer_dtype(codes, categories)
except (ValueError, TypeError):
raise ValueError(
"codes need to be convertible to an arrays of integers")

This conversation was marked as resolved by TomAugspurger

This comment has been minimized.

Copy link
@topper-123

topper-123 Dec 22, 2018

Author Contributor

`fastpath` is always True in this constructor, and `coerce_indexer_dtype` is already called in the `__init__` constructor when `fastpath=True`, so calling it again here is not needed.

if len(codes) and (
codes.max() >= len(dtype.categories) or codes.min() < -1):
raise ValueError("codes need to be between -1 and "
@@ -1265,8 +1281,7 @@ def shift(self, periods, fill_value=None):
else:
codes[periods:] = fill_value

return self.from_codes(codes, categories=self.categories,
ordered=self.ordered)
return self.from_codes(codes, dtype=self.dtype)

def __array__(self, dtype=None):
"""
@@ -1887,9 +1902,7 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):

codes = take(self._codes, indexer, allow_fill=allow_fill,
fill_value=fill_value)
result = type(self).from_codes(codes,
categories=dtype.categories,
ordered=dtype.ordered)
result = type(self).from_codes(codes, dtype=dtype)
return result

take = take_nd
@@ -2078,9 +2091,7 @@ def __setitem__(self, key, value):
new_codes = _recode_for_categories(
value.codes, value.categories, self.categories
)
value = Categorical.from_codes(new_codes,
categories=self.categories,
ordered=self.ordered)
value = Categorical.from_codes(new_codes, dtype=self.dtype)

rvalue = value if is_list_like(value) else [value]

@@ -148,8 +148,7 @@ def _create_from_codes(self, codes, dtype=None, name=None):
dtype = self.dtype
if name is None:
name = self.name
cat = Categorical.from_codes(codes, categories=dtype.categories,
ordered=dtype.ordered)
cat = Categorical.from_codes(codes, dtype=dtype)
return CategoricalIndex(cat, name=name)

@classmethod
@@ -77,7 +77,9 @@ def test_constructor_unsortable(self):
assert not factor.ordered

# this however will raise as cannot be sorted
with pytest.raises(TypeError):
msg = ("'values' is not ordered, please explicitly specify the "
"categories order by passing in a categories argument.")
with pytest.raises(TypeError, match=msg):
Categorical(arr, ordered=True)

def test_constructor_interval(self):
@@ -99,10 +101,11 @@ def test_constructor(self):
tm.assert_numpy_array_equal(c2.__array__(), exp_arr)

# categories must be unique
with pytest.raises(ValueError):
msg = "Categorical categories must be unique"
with pytest.raises(ValueError, match=msg):
Categorical([1, 2], [1, 2, 2])

with pytest.raises(ValueError):
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ["a", "b", "b"])

# The default should be unordered
@@ -211,21 +214,23 @@ def test_constructor(self):

def test_constructor_not_sequence(self):
# https://github.com/pandas-dev/pandas/issues/16022
with pytest.raises(TypeError):
msg = r"^Parameter 'categories' must be list-like, was"
with pytest.raises(TypeError, match=msg):
Categorical(['a', 'b'], categories='a')

def test_constructor_with_null(self):

# Cannot have NaN in categories
with pytest.raises(ValueError):
msg = "Categorial categories cannot be null"
with pytest.raises(ValueError, match=msg):
Categorical([np.nan, "a", "b", "c"],
categories=[np.nan, "a", "b", "c"])

with pytest.raises(ValueError):
with pytest.raises(ValueError, match=msg):
Categorical([None, "a", "b", "c"],
categories=[None, "a", "b", "c"])

with pytest.raises(ValueError):
with pytest.raises(ValueError, match=msg):
Categorical(DatetimeIndex(['nat', '20160101']),
categories=[NaT, Timestamp('20160101')])

@@ -347,13 +352,14 @@ def test_constructor_with_dtype(self, ordered):

def test_constructor_dtype_and_others_raises(self):
dtype = CategoricalDtype(['a', 'b'], ordered=True)
with pytest.raises(ValueError, match="Cannot"):
msg = "Cannot specify `categories` or `ordered` together with `dtype`."
with pytest.raises(ValueError, match=msg):
Categorical(['a', 'b'], categories=['a', 'b'], dtype=dtype)

with pytest.raises(ValueError, match="Cannot"):
with pytest.raises(ValueError, match=msg):
Categorical(['a', 'b'], ordered=True, dtype=dtype)

with pytest.raises(ValueError, match="Cannot"):
with pytest.raises(ValueError, match=msg):
Categorical(['a', 'b'], ordered=False, dtype=dtype)

@pytest.mark.parametrize('categories', [
@@ -417,33 +423,44 @@ def test_constructor_with_categorical_categories(self):
def test_from_codes(self):

# too few categories
with pytest.raises(ValueError):
Categorical.from_codes([1, 2], [1, 2])
dtype = CategoricalDtype(categories=[1, 2])
msg = "codes need to be between "
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], dtype=dtype)

# no int codes
with pytest.raises(ValueError):
Categorical.from_codes(["a"], [1, 2])
msg = "codes need to be array-like integers"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], dtype=dtype)

# no unique categories
with pytest.raises(ValueError):
Categorical.from_codes([0, 1, 2], ["a", "a", "b"])
with pytest.raises(ValueError,
match="Categorical categories must be unique"):
Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])

# NaN categories included
with pytest.raises(ValueError):
Categorical.from_codes([0, 1, 2], ["a", "b", np.nan])
with pytest.raises(ValueError,
match="Categorial categories cannot be null"):
Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])

# too negative
with pytest.raises(ValueError):
Categorical.from_codes([-2, 1, 2], ["a", "b", "c"])
dtype = CategoricalDtype(categories=["a", "b", "c"])
msg = r"codes need to be between -1 and len\(categories\)-1"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], dtype=dtype)

exp = Categorical(["a", "b", "c"], ordered=False)
res = Categorical.from_codes([0, 1, 2], ["a", "b", "c"])
res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
tm.assert_categorical_equal(exp, res)

# Not available in earlier numpy versions
if hasattr(np.random, "choice"):
codes = np.random.choice([0, 1], 5, p=[0.9, 0.1])
Categorical.from_codes(codes, categories=["train", "test"])
res = Categorical.from_codes([0, 1, 2], dtype=dtype)
tm.assert_categorical_equal(exp, res)

def test_from_codes_with_categorical_categories(self):
# GH17884
@@ -458,28 +475,56 @@ def test_from_codes_with_categorical_categories(self):
tm.assert_categorical_equal(result, expected)

# non-unique Categorical still raises
with pytest.raises(ValueError):
with pytest.raises(ValueError,
match="Categorical categories must be unique"):
Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a']))

def test_from_codes_with_nan_code(self):
# GH21767
codes = [1, 2, np.nan]
categories = ['a', 'b', 'c']
with pytest.raises(ValueError):
Categorical.from_codes(codes, categories)
dtype = CategoricalDtype(categories=['a', 'b', 'c'])
with pytest.raises(ValueError,
match="codes need to be array-like integers"):
Categorical.from_codes(codes, categories=dtype.categories)
with pytest.raises(ValueError,
match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype=dtype)

def test_from_codes_with_float(self):
# GH21767
codes = [1.0, 2.0, 0] # integer, but in float dtype
categories = ['a', 'b', 'c']
dtype = CategoricalDtype(categories=['a', 'b', 'c'])

with tm.assert_produces_warning(FutureWarning):
cat = Categorical.from_codes(codes, dtype.categories)
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))

with tm.assert_produces_warning(FutureWarning):
cat = Categorical.from_codes(codes, categories)
cat = Categorical.from_codes(codes, dtype=dtype)
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))

codes = [1.1, 2.0, 0] # non-integer
with pytest.raises(ValueError):
Categorical.from_codes(codes, categories)
with pytest.raises(ValueError,
match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype.categories)
with pytest.raises(ValueError,
match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype=dtype)

def test_from_codes_with_dtype_raises(self):
msg = 'Cannot specify'
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([0, 1], categories=['a', 'b'],
dtype=CategoricalDtype(['a', 'b']))

with pytest.raises(ValueError, match=msg):
Categorical.from_codes([0, 1], ordered=True,
dtype=CategoricalDtype(['a', 'b']))

def test_from_codes_neither(self):
msg = "Both were None"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([0, 1])

@pytest.mark.parametrize('dtype', [None, 'category'])
def test_from_inferred_categories(self, dtype):
@@ -515,14 +560,11 @@ def test_from_inferred_categories_coerces(self):
expected = Categorical([1, 1, 2, np.nan])
tm.assert_categorical_equal(result, expected)

def test_construction_with_ordered(self):
@pytest.mark.parametrize('ordered', [None, True, False])
def test_construction_with_ordered(self, ordered):
# GH 9347, 9190
cat = Categorical([0, 1, 2])
assert not cat.ordered
cat = Categorical([0, 1, 2], ordered=False)
assert not cat.ordered
cat = Categorical([0, 1, 2], ordered=True)
assert cat.ordered
cat = Categorical([0, 1, 2], ordered=ordered)
assert cat.ordered == bool(ordered)

@pytest.mark.xfail(reason="Imaginary values not supported in Categorical")
def test_constructor_imaginary(self):
@@ -158,7 +158,7 @@ def test_construction_with_categorical_dtype(self):
tm.assert_index_equal(result, expected, exact=True)

# error when combining categories/ordered and dtype kwargs
msg = 'Cannot specify `categories` or `ordered` together with `dtype`.'
msg = "Cannot specify `categories` or `ordered` together with `dtype`."
with pytest.raises(ValueError, match=msg):
CategoricalIndex(data, categories=cats, dtype=dtype)

ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.