pandas-dev · phofl · Nov 9, 2022 · Sep 30, 2022 · Sep 30, 2022 · Sep 30, 2022
diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst
@@ -221,6 +221,7 @@ MultiIndex
 - Bug in :meth:`MultiIndex.intersection` losing extension array (:issue:`48604`)
 - Bug in :meth:`MultiIndex.union` losing extension array (:issue:`48498`, :issue:`48505`)
 - Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`)
+- Bug in :meth:`MultiIndex.get_indexer` returning incorrect indices if the index contains NA/missing values (:issue:`48877`)
 - Bug in :meth:`MultiIndex.symmetric_difference` losing extension array (:issue:`48607`)
 -
 

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4028,6 +4028,11 @@ def get_indexer(
                 target, method=method, limit=limit, tolerance=tolerance
             )
 
+        if self._is_multi and self._contains_na_any_level:
+            # GH48877
+            # MultiIndex requires special treatment to handle missing values
+            return Index(self._values).get_indexer_for(target)
+
         return self._get_indexer(target, method, limit, tolerance)
 
     def _get_indexer(

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -2613,6 +2613,15 @@ def _maybe_preserve_names(self, target: Index, preserve_names: bool) -> Index:
             target.names = self.names
         return target
 
+    @cache_readonly
+    def _contains_na_any_level(self) -> bool:
+        """
+        Whether at least one level holds a missing value, i.e. a code of ``-1``.
+        """
+        if not len(self):
+            return False
+        return any(level_codes.min() == -1 for level_codes in self.codes)
+
     # --------------------------------------------------------------------
     # Indexing Methods
 

diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py
@@ -924,3 +924,17 @@ def test_get_locs_reordering(keys, expected):
     result = idx.get_locs(keys)
     expected = np.array(expected, dtype=np.intp)
     tm.assert_numpy_array_equal(result, expected)
+
+
+def test_get_indexer_for_multiindex_with_nans(nulls_fixture):
+    # GH#37222: only ("A", 2.0) is shared, so the other position maps to -1
+    idx1 = MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"])
+    idx2 = MultiIndex.from_product([["A"], [nulls_fixture, 2.0]], names=["id1", "id2"])
+
+    result = idx2.get_indexer(idx1)
+    expected = np.array([-1, 1], dtype=np.intp)
+    tm.assert_numpy_array_equal(result, expected)
+
+    result = idx1.get_indexer(idx2)
+    expected = np.array([-1, 1], dtype=np.intp)
+    tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py
@@ -2,6 +2,7 @@
 import pytest
 
 from pandas import (
+    DataFrame,
     Index,
     Interval,
     MultiIndex,
@@ -158,3 +159,21 @@ def test_join_overlapping_interval_level():
     result = idx_1.join(idx_2, how="outer")
 
     tm.assert_index_equal(result, expected)
+
+
+def test_join_multi_with_nan():
+    # GH#29252: a NaN key on the right must not match any key on the left
+    df1 = DataFrame(
+        data={"col1": [1.1, 1.2]},
+        index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]),
+    )
+    df2 = DataFrame(
+        data={"col2": [2.1, 2.2]},
+        index=MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"]),
+    )
+    result = df1.join(df2)
+    expected = DataFrame(
+        data={"col1": [1.1, 1.2], "col2": [np.nan, 2.2]},
+        index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]),
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py
@@ -159,3 +159,15 @@ def test_reindex_limit_arg_with_multiindex():
         match="limit argument only valid if doing pad, backfill or nearest reindexing",
     ):
         df.reindex(new_idx, fill_value=0, limit=1)
+
+
+def test_reindex_with_none_in_nested_multiindex():
+    # GH#42883: reindex_like must align nested tuple labels containing None
+    idx = MultiIndex.from_tuples([(("a", None), 1), (("b", None), 2)])
+    idx_swapped = MultiIndex.from_tuples([(("b", None), 2), (("a", None), 1)])
+    frame = pd.DataFrame([1, 2], index=idx)
+    expected = pd.DataFrame([2, 1], index=idx_swapped)
+
+    result = frame.reindex_like(expected)
+
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
@@ -4,6 +4,7 @@
 import pandas as pd
 from pandas import (
     CategoricalIndex,
+    DataFrame,
     Index,
     IntervalIndex,
     MultiIndex,
@@ -550,6 +551,15 @@ def test_intersection_with_missing_values_on_both_sides(nulls_fixture):
     tm.assert_index_equal(result, expected)
 
 
+def test_union_with_missing_values_on_both_sides():
+    # GH#38623: NaN present in both operands must appear only once in the union
+    mi1 = MultiIndex.from_arrays([[1, np.nan]])
+    mi2 = MultiIndex.from_arrays([[1, np.nan, 3]])
+    result = mi1.union(mi2)
+    expected = MultiIndex.from_arrays([[1, np.nan, 3]])
+    tm.assert_index_equal(result, expected)
+
+
 def test_union_nan_got_duplicated():
     # GH#38977
     mi1 = MultiIndex.from_arrays([[1.0, np.nan], [2, 3]])
@@ -620,15 +630,13 @@ def test_union_keep_dtype_precision(any_real_numeric_dtype):
 
 def test_union_keep_ea_dtype_with_na(any_numeric_ea_dtype):
     # GH#48498
-
     arr1 = Series([4, pd.NA], dtype=any_numeric_ea_dtype)
     arr2 = Series([1, pd.NA], dtype=any_numeric_ea_dtype)
     midx = MultiIndex.from_arrays([arr1, [2, 1]], names=["a", None])
     midx2 = MultiIndex.from_arrays([arr2, [1, 2]])
     result = midx.union(midx2)
-    # Expected is actually off and should contain (1, 1) too. See GH#37222
     expected = MultiIndex.from_arrays(
-        [Series([4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [2, 1, 2]]
+        [Series([1, 4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [1, 2, 1, 2]]
     )
     tm.assert_index_equal(result, expected)
 
@@ -667,3 +675,12 @@ def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype):
     result = midx.intersection(midx2)
     expected = MultiIndex.from_arrays([Series([2], dtype=any_numeric_ea_dtype), [1]])
     tm.assert_index_equal(result, expected)
+
+
+def test_union_with_na_when_constructing_dataframe():
+    # GH#43222: the all-NA key shared by both Series becomes a single column
+    left = Series((1,), index=MultiIndex.from_tuples(((None, None),)))
+    right = Series((10, 20), index=MultiIndex.from_tuples(((None, None), ("a", "b"))))
+    result = DataFrame([left, right])
+    expected = DataFrame({(np.nan, np.nan): [1.0, 10.0], ("a", "b"): [np.nan, 20.0]})
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -162,10 +162,10 @@ def test_rename_multiindex_with_duplicates(self):
                     [1, 2],
                 ],
                 [
-                    [(81.0, np.nan), (np.nan, np.nan)],
-                    [(81.0, np.nan), (np.nan, np.nan)],
-                    [1, 2],
-                    [1, 1],
+                    [(81.0, np.nan), (np.nan, np.nan), (82.0, np.nan)],
+                    [(81.0, np.nan), (np.nan, np.nan), (82.0, np.nan)],
+                    [1, 2, np.nan],
+                    [np.nan, 1, 2],
                 ],
             ),
             (
@@ -227,3 +227,15 @@ def test_multiindex_repeated_keys(self):
             ],
             Series([1, 1, 2, 2], MultiIndex.from_arrays([["a", "a", "b", "b"]])),
         )
+
+    def test_multiindex_with_na_missing_key(self):
+        # GH#46173: a missing key must raise even when a column label is NA
+        columns = {
+            ("foo",): [1, 2, 3],
+            ("bar",): [5, 6, 7],
+            (None,): [8, 9, 0],
+        }
+        frame = DataFrame.from_dict(columns)
+
+        with pytest.raises(KeyError, match="missing_key"):
+            frame[[("missing_key",)]]