From 55e20a2d0b0d472037b761721888d4d9ff01ff3d Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 10 Sep 2020 11:18:27 -0700 Subject: [PATCH 1/2] REF: implement Categorical._validate_listlike --- pandas/core/arrays/categorical.py | 44 ++++++++++++++++++++++--------- pandas/core/dtypes/concat.py | 10 ++----- pandas/core/indexes/category.py | 29 +++----------------- pandas/core/reshape/merge.py | 9 ++----- 4 files changed, 39 insertions(+), 53 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a2b5b54c55490..fe66a8c07e13a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1731,6 +1731,35 @@ def _box_func(self, i: int): return np.NaN return self.categories[i] + def _validate_listlike(self, target: ArrayLike) -> np.ndarray: + """ + Extract integer codes we can use for comparison. + + Notes + ----- + If a value in target is not present, it gets coded as -1. + """ + + if isinstance(target, Categorical): + # Indexing on codes is more efficient if categories are the same, + # so we can apply some optimizations based on the degree of + # dtype-matching. + if self.categories.equals(target.categories): + # We use the same codes, so can go directly to the engine + codes = target.codes + elif self.is_dtype_equal(target): + # We have the same categories up to a reshuffling of codes. + codes = recode_for_categories( + target.codes, target.categories, self.categories + ) + else: + code_indexer = self.categories.get_indexer(target.categories) + codes = take_1d(code_indexer, target.codes, fill_value=-1) + else: + codes = self.categories.get_indexer(target) + + return codes + # ------------------------------------------------------------------ def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): @@ -1905,11 +1934,8 @@ def _validate_setitem_value(self, value): "Cannot set a Categorical with another, " "without identical categories" ) - if not self.categories.equals(value.categories): - new_codes = recode_for_categories( - value.codes, value.categories, self.categories - ) - value = Categorical.from_codes(new_codes, dtype=self.dtype) + new_codes = self._validate_listlike(value) + value = Categorical.from_codes(new_codes, dtype=self.dtype) rvalue = value if is_list_like(value) else [value] @@ -2179,13 +2205,7 @@ def equals(self, other: object) -> bool: if not isinstance(other, Categorical): return False elif self.is_dtype_equal(other): - if self.categories.equals(other.categories): - # fastpath to avoid re-coding - other_codes = other._codes - else: - other_codes = recode_for_categories( - other.codes, other.categories, self.categories - ) + other_codes = self._validate_listlike(other) return np.array_equal(self._codes, other_codes) return False diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index dd005752a4832..1ea4ff117f209 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -310,14 +310,8 @@ def _maybe_unwrap(x): categories = first.categories ordered = first.ordered - if all(first.categories.equals(other.categories) for other in to_union[1:]): - new_codes = np.concatenate([c.codes for c in to_union]) - else: - codes = [first.codes] + [ - recode_for_categories(other.codes, other.categories, first.categories) - for other in to_union[1:] - ] - new_codes = np.concatenate(codes) + all_codes = [first._validate_listlike(x) for x in to_union] + new_codes = np.concatenate(all_codes) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with ordered Categoricals") diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 7509cb35069e8..a450e3b9fdee7 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -23,8 +23,7 @@ from pandas.core.dtypes.missing import is_valid_nat_for_dtype, notna from pandas.core import accessor -from pandas.core.algorithms import take_1d -from pandas.core.arrays.categorical import Categorical, contains, recode_for_categories +from pandas.core.arrays.categorical import Categorical, contains import pandas.core.common as com from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase @@ -558,21 +557,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): "method='nearest' not implemented yet for CategoricalIndex" ) - if isinstance(target, CategoricalIndex) and self._values.is_dtype_equal(target): - if self._values.equals(target._values): - # we have the same codes - codes = target.codes - else: - codes = recode_for_categories( - target.codes, target.categories, self._values.categories - ) - else: - if isinstance(target, CategoricalIndex): - code_indexer = self.categories.get_indexer(target.categories) - codes = take_1d(code_indexer, target.codes, fill_value=-1) - else: - codes = self.categories.get_indexer(target) - + codes = self._values._validate_listlike(target._values) indexer, _ = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer) @@ -580,15 +565,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def get_indexer_non_unique(self, target): target = ibase.ensure_index(target) - if isinstance(target, CategoricalIndex): - # Indexing on codes is more efficient if categories are the same: - if target.categories is self.categories: - target = target.codes - indexer, missing = self._engine.get_indexer_non_unique(target) - return ensure_platform_int(indexer), missing - target = target._values - - codes = self.categories.get_indexer(target) + codes = self._values._validate_listlike(target._values) indexer, missing = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer), missing diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9f19ea9aefe09..d95355589fd0c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -43,7 +43,6 @@ from pandas import Categorical, Index, MultiIndex from pandas.core import groupby import pandas.core.algorithms as algos -from pandas.core.arrays.categorical import recode_for_categories import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.frame import _merge_doc @@ -1936,12 +1935,8 @@ def _factorize_keys( ): assert isinstance(lk, Categorical) assert isinstance(rk, Categorical) - if lk.categories.equals(rk.categories): - # if we exactly match in categories, allow us to factorize on codes - rk = rk.codes - else: - # Same categories in different orders -> recode - rk = recode_for_categories(rk.codes, rk.categories, lk.categories) + # Cast rk to encoding so we can compare codes with lk + rk = lk._validate_listlike(rk) lk = ensure_int64(lk.codes) rk = ensure_int64(rk) From 9297ead9dde7b8ed51c44ddbb8f7b473dba20f48 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 12 Sep 2020 20:07:35 -0700 Subject: [PATCH 2/2] REF: De-duplicate get_indexer_non_unique --- pandas/core/indexes/multi.py | 4 ---- pandas/core/indexes/period.py | 19 +++---------------- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7aceb898f5ccf..c31428e78dab5 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2512,10 +2512,6 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return ensure_platform_int(indexer) - @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) - def get_indexer_non_unique(self, target): - return super().get_indexer_non_unique(target) - def get_slice_bound( self, label: Union[Hashable, Sequence[Hashable]], side: str, kind: str ) -> int: diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 5282b6f0154b4..42dce1bd53f22 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -12,7 +12,6 @@ from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( - ensure_platform_int, is_bool_dtype, is_datetime64_any_dtype, is_dtype_equal, @@ -473,12 +472,13 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): target = ensure_index(target) if isinstance(target, PeriodIndex): - if target.freq != self.freq: + if not self._is_comparable_dtype(target.dtype): + # i.e. target.freq != self.freq # No matches no_matches = -1 * np.ones(self.shape, dtype=np.intp) return no_matches - target = target.asi8 + target = target._get_engine_target() # i.e. target.asi8 self_index = self._int64index else: self_index = self @@ -491,19 +491,6 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return Index.get_indexer(self_index, target, method, limit, tolerance) - @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) - def get_indexer_non_unique(self, target): - target = ensure_index(target) - - if not self._is_comparable_dtype(target.dtype): - no_matches = -1 * np.ones(self.shape, dtype=np.intp) - return no_matches, no_matches - - target = target.asi8 - - indexer, missing = self._int64index.get_indexer_non_unique(target) - return ensure_platform_int(indexer), missing - def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label.