From 55e20a2d0b0d472037b761721888d4d9ff01ff3d Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Thu, 10 Sep 2020 11:18:27 -0700
Subject: [PATCH 1/2] REF: implement Categorical._validate_listlike

---
 pandas/core/arrays/categorical.py | 44 ++++++++++++++++++++++---------
 pandas/core/dtypes/concat.py      | 10 ++-----
 pandas/core/indexes/category.py   | 29 +++-----------------
 pandas/core/reshape/merge.py      |  9 ++-----
 4 files changed, 39 insertions(+), 53 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index a2b5b54c55490..fe66a8c07e13a 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1731,6 +1731,35 @@ def _box_func(self, i: int):
             return np.NaN
         return self.categories[i]
 
+    def _validate_listlike(self, target: ArrayLike) -> np.ndarray:
+        """
+        Encode ``target`` as integer codes relative to ``self.categories``.
+
+        Notes
+        -----
+        A value in target that is not among self.categories is coded as -1.
+        """
+
+        if isinstance(target, Categorical):
+            # Working on codes is cheaper than re-looking-up every value, so
+            #  pick the cheapest conversion that the degree of dtype-matching
+            #  between self and target allows.
+            if self.categories.equals(target.categories):
+                # Categories are equal, so target's codes can be reused as-is.
+                codes = target.codes
+            elif self.is_dtype_equal(target):
+                # We have the same categories up to a reshuffling of codes.
+                codes = recode_for_categories(
+                    target.codes, target.categories, self.categories
+                )
+            else:  # dtypes differ: map target's categories onto ours
+                code_indexer = self.categories.get_indexer(target.categories)
+                codes = take_1d(code_indexer, target.codes, fill_value=-1)
+        else:  # not a Categorical: look each value up in our categories
+            codes = self.categories.get_indexer(target)
+
+        return codes
+
     # ------------------------------------------------------------------
 
     def take_nd(self, indexer, allow_fill: bool = False, fill_value=None):
@@ -1905,11 +1934,8 @@ def _validate_setitem_value(self, value):
                     "Cannot set a Categorical with another, "
                     "without identical categories"
                 )
-            if not self.categories.equals(value.categories):
-                new_codes = recode_for_categories(
-                    value.codes, value.categories, self.categories
-                )
-                value = Categorical.from_codes(new_codes, dtype=self.dtype)
+            new_codes = self._validate_listlike(value)
+            value = Categorical.from_codes(new_codes, dtype=self.dtype)
 
         rvalue = value if is_list_like(value) else [value]
 
@@ -2179,13 +2205,7 @@ def equals(self, other: object) -> bool:
         if not isinstance(other, Categorical):
             return False
         elif self.is_dtype_equal(other):
-            if self.categories.equals(other.categories):
-                # fastpath to avoid re-coding
-                other_codes = other._codes
-            else:
-                other_codes = recode_for_categories(
-                    other.codes, other.categories, self.categories
-                )
+            other_codes = self._validate_listlike(other)
             return np.array_equal(self._codes, other_codes)
         return False
 
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index dd005752a4832..1ea4ff117f209 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -310,14 +310,8 @@ def _maybe_unwrap(x):
         categories = first.categories
         ordered = first.ordered
 
-        if all(first.categories.equals(other.categories) for other in to_union[1:]):
-            new_codes = np.concatenate([c.codes for c in to_union])
-        else:
-            codes = [first.codes] + [
-                recode_for_categories(other.codes, other.categories, first.categories)
-                for other in to_union[1:]
-            ]
-            new_codes = np.concatenate(codes)
+        all_codes = [first._validate_listlike(x) for x in to_union]
+        new_codes = np.concatenate(all_codes)
 
         if sort_categories and not ignore_order and ordered:
             raise TypeError("Cannot use sort_categories=True with ordered Categoricals")
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 7509cb35069e8..a450e3b9fdee7 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -23,8 +23,7 @@
 from pandas.core.dtypes.missing import is_valid_nat_for_dtype, notna
 
 from pandas.core import accessor
-from pandas.core.algorithms import take_1d
-from pandas.core.arrays.categorical import Categorical, contains, recode_for_categories
+from pandas.core.arrays.categorical import Categorical, contains
 import pandas.core.common as com
 from pandas.core.construction import extract_array
 import pandas.core.indexes.base as ibase
@@ -558,21 +557,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
                 "method='nearest' not implemented yet for CategoricalIndex"
             )
 
-        if isinstance(target, CategoricalIndex) and self._values.is_dtype_equal(target):
-            if self._values.equals(target._values):
-                # we have the same codes
-                codes = target.codes
-            else:
-                codes = recode_for_categories(
-                    target.codes, target.categories, self._values.categories
-                )
-        else:
-            if isinstance(target, CategoricalIndex):
-                code_indexer = self.categories.get_indexer(target.categories)
-                codes = take_1d(code_indexer, target.codes, fill_value=-1)
-            else:
-                codes = self.categories.get_indexer(target)
-
+        codes = self._values._validate_listlike(target._values)
         indexer, _ = self._engine.get_indexer_non_unique(codes)
         return ensure_platform_int(indexer)
 
@@ -580,15 +565,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
     def get_indexer_non_unique(self, target):
         target = ibase.ensure_index(target)
 
-        if isinstance(target, CategoricalIndex):
-            # Indexing on codes is more efficient if categories are the same:
-            if target.categories is self.categories:
-                target = target.codes
-                indexer, missing = self._engine.get_indexer_non_unique(target)
-                return ensure_platform_int(indexer), missing
-            target = target._values
-
-        codes = self.categories.get_indexer(target)
+        codes = self._values._validate_listlike(target._values)
         indexer, missing = self._engine.get_indexer_non_unique(codes)
         return ensure_platform_int(indexer), missing
 
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 9f19ea9aefe09..d95355589fd0c 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -43,7 +43,6 @@
 from pandas import Categorical, Index, MultiIndex
 from pandas.core import groupby
 import pandas.core.algorithms as algos
-from pandas.core.arrays.categorical import recode_for_categories
 import pandas.core.common as com
 from pandas.core.construction import extract_array
 from pandas.core.frame import _merge_doc
@@ -1936,12 +1935,8 @@ def _factorize_keys(
     ):
         assert isinstance(lk, Categorical)
         assert isinstance(rk, Categorical)
-        if lk.categories.equals(rk.categories):
-            # if we exactly match in categories, allow us to factorize on codes
-            rk = rk.codes
-        else:
-            # Same categories in different orders -> recode
-            rk = recode_for_categories(rk.codes, rk.categories, lk.categories)
+        # Cast rk to encoding so we can compare codes with lk
+        rk = lk._validate_listlike(rk)
 
         lk = ensure_int64(lk.codes)
         rk = ensure_int64(rk)

From 9297ead9dde7b8ed51c44ddbb8f7b473dba20f48 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sat, 12 Sep 2020 20:07:35 -0700
Subject: [PATCH 2/2] REF: De-duplicate get_indexer_non_unique

---
 pandas/core/indexes/multi.py  |  4 ----
 pandas/core/indexes/period.py | 19 +++----------------
 2 files changed, 3 insertions(+), 20 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 7aceb898f5ccf..c31428e78dab5 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -2512,10 +2512,6 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
 
         return ensure_platform_int(indexer)
 
-    @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
-    def get_indexer_non_unique(self, target):
-        return super().get_indexer_non_unique(target)
-
     def get_slice_bound(
         self, label: Union[Hashable, Sequence[Hashable]], side: str, kind: str
     ) -> int:
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
index 5282b6f0154b4..42dce1bd53f22 100644
--- a/pandas/core/indexes/period.py
+++ b/pandas/core/indexes/period.py
@@ -12,7 +12,6 @@
 from pandas.util._decorators import Appender, cache_readonly, doc
 
 from pandas.core.dtypes.common import (
-    ensure_platform_int,
     is_bool_dtype,
     is_datetime64_any_dtype,
     is_dtype_equal,
@@ -473,12 +472,13 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
         target = ensure_index(target)
 
         if isinstance(target, PeriodIndex):
-            if target.freq != self.freq:
+            if not self._is_comparable_dtype(target.dtype):
+                # i.e. target.freq != self.freq
                 # No matches
                 no_matches = -1 * np.ones(self.shape, dtype=np.intp)
                 return no_matches
 
-            target = target.asi8
+            target = target._get_engine_target()  # i.e. target.asi8
             self_index = self._int64index
         else:
             self_index = self
@@ -491,19 +491,6 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
 
         return Index.get_indexer(self_index, target, method, limit, tolerance)
 
-    @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
-    def get_indexer_non_unique(self, target):
-        target = ensure_index(target)
-
-        if not self._is_comparable_dtype(target.dtype):
-            no_matches = -1 * np.ones(self.shape, dtype=np.intp)
-            return no_matches, no_matches
-
-        target = target.asi8
-
-        indexer, missing = self._int64index.get_indexer_non_unique(target)
-        return ensure_platform_int(indexer), missing
-
     def get_loc(self, key, method=None, tolerance=None):
         """
         Get integer location for requested label.