From e0d3bb48f5d043d8c9f8f4b66fe60112b6beeb0c Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Wed, 15 May 2019 17:52:10 +0800 Subject: [PATCH 01/17] BUG: MultiIndex not dropping nan level and invalid code value --- doc/source/whatsnew/v0.25.0.rst | 4 ++-- pandas/core/indexes/multi.py | 12 ++++++++++++ pandas/tests/indexes/multi/test_constructor.py | 18 ++++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 49518c57fc846..29086ef6215a5 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -387,8 +387,8 @@ MultiIndex ^^^^^^^^^^ - Bug in which incorrect exception raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`) -- -- +- Bug in having code value < -1 +- Bug in not dropping nan level (:issue:`19387`) I/O ^^^ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7a933bdcb0953..35db36b650b64 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -243,10 +243,19 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, if verify_integrity: result._verify_integrity() + + codes = [cls._reassign_na_codes(*it) for it in zip(levels, codes)] + result._set_codes(codes, validate=False) + if _set_identity: result._reset_identity() + return result + @classmethod + def _reassign_na_codes(cls, level, code): + return [-1 if x == -1 or isna(level[x]) else x for x in code] + def _verify_integrity(self, codes=None, levels=None): """ @@ -281,6 +290,9 @@ def _verify_integrity(self, codes=None, levels=None): " level (%d). NOTE: this index is in an" " inconsistent state" % (i, level_codes.max(), len(level))) + if len(level_codes) and level_codes.min() < -1: + raise ValueError("On level %d, code value (%d) < -1" % + (i, level_codes.min())) if not level.is_unique: raise ValueError("Level values must be unique: {values} on " "level {level}".format( diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 639e6aee87c6f..11dd5a7abcde8 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -67,6 +67,7 @@ def test_constructor_mismatched_codes_levels(idx): length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." " NOTE: this index is in an inconsistent state") label_error = r"Unequal code lengths: \[4, 2\]" + code_value_error = (r"On level 0, code value \(-2\) < -1") # important to check that it's looking at the right thing. with pytest.raises(ValueError, match=length_error): @@ -83,6 +84,23 @@ def test_constructor_mismatched_codes_levels(idx): with pytest.raises(ValueError, match=label_error): idx.copy().set_codes([[0, 0, 0, 0], [0, 0]]) + # code value smaller than -1 + with pytest.raises(ValueError, match=code_value_error): + MultiIndex(levels=[['a'], ['b']], codes=[[0, -2], [0, 0]]) + + +def test_na_levels(): + tm.assert_index_equal( + MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], + codes=[[0, -1, 1, 2, 3, 4]]), + MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], + codes=[[-1, -1, -1, -1, 3, 4]])) + tm.assert_index_equal( + MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], + codes=[[0, -1, 1, 2, 3, 4]]), + MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], + codes=[[-1, -1, 1, -1, 3, -1]])) + def test_labels_deprecated(idx): # GH23752 From af2ad9106025faf0c59bfc333075061fe57fc420 Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Thu, 16 May 2019 09:09:40 +0800 Subject: [PATCH 02/17] update whatsnew statement --- doc/source/whatsnew/v0.25.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 29086ef6215a5..b185d9b0e539d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -387,8 +387,8 @@ MultiIndex ^^^^^^^^^^ - Bug in which incorrect exception raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`) -- Bug in having code value < -1 -- Bug in not dropping nan level (:issue:`19387`) +- Bug in :class:`Multindex` construction from levels and codes that would incorrectly allows code values < -1 +- Bug in :class:`Multindex` construction from levels and codes that would incorrectly allows NaN levels (:issue:`19387`) I/O ^^^ From 248177bfe7908266b036a08dc567e04ef73f76ac Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Thu, 16 May 2019 11:15:50 +0800 Subject: [PATCH 03/17] vectorise reassignment of na codes --- pandas/core/indexes/multi.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 35db36b650b64..5a60287acc19f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -244,7 +244,8 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, if verify_integrity: result._verify_integrity() - codes = [cls._reassign_na_codes(*it) for it in zip(levels, codes)] + codes = [cls._reassign_na_codes(level, code) + for level, code in zip(levels, codes)] result._set_codes(codes, validate=False) if _set_identity: @@ -254,7 +255,10 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, @classmethod def _reassign_na_codes(cls, level, code): - return [-1 if x == -1 or isna(level[x]) else x for x in code] + null_mask = isna(level) + if np.any(null_mask): + code = np.where((code == -1) | null_mask[code], -1, code) + return code def _verify_integrity(self, codes=None, levels=None): """ From b6d29bd747a1e2347b5563dc5a4a3ab5cdd61e3b Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Thu, 16 May 2019 12:20:19 +0800 Subject: [PATCH 04/17] remove redundant boundary check --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 5a60287acc19f..390741aaa5800 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -257,7 +257,7 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, def _reassign_na_codes(cls, level, code): null_mask = isna(level) if np.any(null_mask): - code = np.where((code == -1) | null_mask[code], -1, code) + code = np.where(null_mask[code], -1, code) return code def _verify_integrity(self, codes=None, levels=None): From 041b4cdcaf6913cb3b8eb56bb5760c7517ce20ad Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Fri, 17 May 2019 11:02:48 +0800 Subject: [PATCH 05/17] refactor validate code and add tests for set_code and set_level --- doc/source/whatsnew/v0.25.0.rst | 3 +- pandas/core/indexes/multi.py | 30 ++++++++-------- .../tests/indexes/multi/test_constructor.py | 34 +++++++++++++------ 3 files changed, 41 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index b185d9b0e539d..c8c97c4ff82df 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -387,8 +387,7 @@ MultiIndex ^^^^^^^^^^ - Bug in which incorrect exception raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`) -- Bug in :class:`Multindex` construction from levels and codes that would incorrectly allows code values < -1 -- Bug in :class:`Multindex` construction from levels and codes that would incorrectly allows NaN levels (:issue:`19387`) +- Bug in :class:`Multindex` construction from levels and codes that would incorrectly allows code values < -1 or NaN levels (:issue:`19387`) I/O ^^^ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 390741aaa5800..7edddde5a8919 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -244,17 +244,12 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, if verify_integrity: result._verify_integrity() - codes = [cls._reassign_na_codes(level, code) - for level, code in zip(levels, codes)] - result._set_codes(codes, validate=False) - if _set_identity: result._reset_identity() return result - @classmethod - def _reassign_na_codes(cls, level, code): + def _validate_codes(cls, level, code): null_mask = isna(level) if np.any(null_mask): code = np.where(null_mask[code], -1, code) @@ -290,19 +285,26 @@ def _verify_integrity(self, codes=None, levels=None): raise ValueError("Unequal code lengths: %s" % ([len(code_) for code_ in codes])) if len(level_codes) and level_codes.max() >= len(level): - raise ValueError("On level %d, code max (%d) >= length of" - " level (%d). NOTE: this index is in an" - " inconsistent state" % (i, level_codes.max(), - len(level))) + raise ValueError("On level {level}, code max ({max_code})" + " >= length of level ({level_len}). " + "NOTE: this index is in an inconsistent" + " state".format( + level=i, max_code=level_codes.max(), + level_len=len(level))) if len(level_codes) and level_codes.min() < -1: - raise ValueError("On level %d, code value (%d) < -1" % - (i, level_codes.min())) + raise ValueError("On level {level}, code value ({code})" + " < -1".format( + level=i, code=level_codes.min())) if not level.is_unique: raise ValueError("Level values must be unique: {values} on " "level {level}".format( values=[value for value in level], level=i)) + codes = [self._validate_codes(level, code) + for level, code in zip(levels, codes)] + self._set_codes(codes, validate=False) + @classmethod def from_arrays(cls, arrays, sortorder=None, names=None): """ @@ -691,7 +693,6 @@ def labels(self): def _set_codes(self, codes, level=None, copy=False, validate=True, verify_integrity=False): - if validate and level is None and len(codes) != self.nlevels: raise ValueError("Length of codes must match number of levels") if validate and level is not None and len(codes) != len(level): @@ -712,8 +713,9 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, if verify_integrity: self._verify_integrity(codes=new_codes) + else: + self._codes = new_codes - self._codes = new_codes self._tuples = None self._reset_cache() diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 11dd5a7abcde8..be3cc934667fd 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -90,16 +90,30 @@ def test_constructor_mismatched_codes_levels(idx): def test_na_levels(): - tm.assert_index_equal( - MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], - codes=[[0, -1, 1, 2, 3, 4]]), - MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], - codes=[[-1, -1, -1, -1, 3, 4]])) - tm.assert_index_equal( - MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], - codes=[[0, -1, 1, 2, 3, 4]]), - MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], - codes=[[-1, -1, 1, -1, 3, -1]])) + # GH26408 + result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], + codes=[[0, -1, 1, 2, 3, 4]]) + expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], + codes=[[-1, -1, -1, -1, 3, 4]]) + tm.assert_index_equal(result, expected) + + result = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], + codes=[[0, -1, 1, 2, 3, 4]]) + expected = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], + codes=[[-1, -1, 1, -1, 3, -1]]) + tm.assert_index_equal(result, expected) + + # verify set_levels and set_codes + result = MultiIndex( + levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]]).set_levels( + [[np.nan, 's', pd.NaT, 128, None]]) + tm.assert_index_equal(result, expected) + + result = MultiIndex( + levels=[[np.nan, 's', pd.NaT, 128, None]], + codes=[[1, 2, 2, 2, 2, 2]]).set_codes( + [[0, -1, 1, 2, 3, 4]]) + tm.assert_index_equal(result, expected) def test_labels_deprecated(idx): From 933615b59b4d1430fde1092279dfda4c4e67842a Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Mon, 20 May 2019 09:30:25 +0800 Subject: [PATCH 06/17] move multiIndex bug to api breaking changes --- doc/source/whatsnew/v0.25.0.rst | 36 ++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index c8c97c4ff82df..c78de3fd1e155 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -77,6 +77,40 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`) df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] + +.. _whatsnew_0250.api_breaking.multi_indexing: + + +MultiIndexing contracted from levels and codes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Constructing a :class:`MultiIndex` with NaN levels or codes value < -1 was allowed previously. +Now, construction with codes value < -1 is not allowed and NaN levels' corresponding codes +would be reassigned as -1. (:issue:`19387`) + +.. ipython:: python + + pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[0, -1, 1, 2, 3, 4]]) + pd.MultiIndex(levels=[[1, 2]], codes=[[0, -2]]) + +*Previous Behavior*: + +.. code-block:: ipython + + MultiIndex(levels=[[nan, None, NaT, 128, 2]], + codes=[[0, -1, 1, 2, 3, 4]]) + MultiIndex(levels=[[1, 2]], + codes=[[0, -2]]) + +*New Behavior*: + +.. ipython:: python + + MultiIndex(levels=[[nan, None, NaT, 128, 2]], + codes=[[-1, -1, -1, -1, 3, 4]]) + ValueError: On level 0, code value (-2) < -1 + + .. _whatsnew_0250.api_breaking.groupby_apply_first_group_once: GroupBy.apply on ``DataFrame`` evaluates first group only once @@ -387,7 +421,7 @@ MultiIndex ^^^^^^^^^^ - Bug in which incorrect exception raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`) -- Bug in :class:`Multindex` construction from levels and codes that would incorrectly allows code values < -1 or NaN levels (:issue:`19387`) +- I/O ^^^ From d4227c64f09dfaa86a1b28b42861f97cdaa085eb Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Mon, 20 May 2019 10:42:41 +0800 Subject: [PATCH 07/17] refactor validate codes and add in doc-strings --- pandas/core/indexes/multi.py | 57 +++++++++++++------ .../tests/indexes/multi/test_constructor.py | 4 +- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7edddde5a8919..cbd01ca252373 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -242,14 +242,30 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, result.sortorder = sortorder if verify_integrity: - result._verify_integrity() + new_codes = result._verify_integrity() + result._codes = new_codes if _set_identity: result._reset_identity() return result - def _validate_codes(cls, level, code): + def _validate_codes(self, level, code): + """ + Reassign code values as -1 if their corresponding levels are NaN. + + Parameters + ---------- + code : optional list + Code to reassign. + level : optional list + Level to check for Nan. + + Returns + ------- + code : new code where code value = -1 if it corresponds + to a NaN level. + """ null_mask = isna(level) if np.any(null_mask): code = np.where(null_mask[code], -1, code) @@ -270,6 +286,11 @@ def _verify_integrity(self, codes=None, levels=None): ValueError If length of levels and codes don't match, if the codes for any level would exceed level bounds, or there are any duplicate levels. + + Returns + ------- + codes : new codes where code value = -1 if it corresponds to a + NaN level. """ # NOTE: Currently does not check, among other things, that cached # nlevels matches nor that sortorder matches actually sortorder. @@ -279,18 +300,18 @@ def _verify_integrity(self, codes=None, levels=None): if len(levels) != len(codes): raise ValueError("Length of levels and codes must match. NOTE:" " this index is in an inconsistent state.") - codes_length = len(self.codes[0]) + codes_length = len(codes[0]) for i, (level, level_codes) in enumerate(zip(levels, codes)): if len(level_codes) != codes_length: raise ValueError("Unequal code lengths: %s" % ([len(code_) for code_ in codes])) if len(level_codes) and level_codes.max() >= len(level): - raise ValueError("On level {level}, code max ({max_code})" - " >= length of level ({level_len}). " - "NOTE: this index is in an inconsistent" - " state".format( - level=i, max_code=level_codes.max(), - level_len=len(level))) + msg = ("On level {level}, code max ({max_code}) >= length of " + "level ({level_len}). NOTE: this index is in an " + "inconsistent state".format( + level=i, max_code=level_codes.max(), + level_len=len(level))) + raise ValueError(msg) if len(level_codes) and level_codes.min() < -1: raise ValueError("On level {level}, code value ({code})" " < -1".format( @@ -301,9 +322,9 @@ def _verify_integrity(self, codes=None, levels=None): values=[value for value in level], level=i)) - codes = [self._validate_codes(level, code) - for level, code in zip(levels, codes)] - self._set_codes(codes, validate=False) + codes = FrozenList([self._validate_codes(level, code) + for level, code in zip(levels, codes)]) + return codes @classmethod def from_arrays(cls, arrays, sortorder=None, names=None): @@ -603,7 +624,8 @@ def _set_levels(self, levels, level=None, copy=False, validate=True, new_levels = FrozenList(new_levels) if verify_integrity: - self._verify_integrity(levels=new_levels) + new_codes = self._verify_integrity(levels=new_levels) + self._codes = new_codes names = self.names self._levels = new_levels @@ -711,10 +733,8 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, level_codes, lev, copy=copy)._shallow_copy() new_codes = FrozenList(new_codes) - if verify_integrity: - self._verify_integrity(codes=new_codes) - else: - self._codes = new_codes + new_codes = self._verify_integrity(codes=new_codes) + self._codes = new_codes self._tuples = None self._reset_cache() @@ -1778,9 +1798,10 @@ def __setstate__(self, state): self._set_levels([Index(x) for x in levels], validate=False) self._set_codes(codes) + new_codes = self._verify_integrity() + self._set_codes(new_codes) self._set_names(names) self.sortorder = sortorder - self._verify_integrity() self._reset_identity() def __getitem__(self, key): diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index be3cc934667fd..b8959bb139732 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -64,10 +64,10 @@ def test_constructor_mismatched_codes_levels(idx): with pytest.raises(ValueError, match=msg): MultiIndex(levels=levels, codes=codes) - length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." + length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." " NOTE: this index is in an inconsistent state") label_error = r"Unequal code lengths: \[4, 2\]" - code_value_error = (r"On level 0, code value \(-2\) < -1") + code_value_error = r"On level 0, code value \(-2\) < -1" # important to check that it's looking at the right thing. with pytest.raises(ValueError, match=length_error): From b66d3a48600b58af06f1f0dff122f6079e7ab662 Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Tue, 21 May 2019 10:59:43 +0800 Subject: [PATCH 08/17] update whatsnew record --- doc/source/whatsnew/v0.25.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index c78de3fd1e155..fb4ef1dcb3ed9 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -81,8 +81,8 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`) .. _whatsnew_0250.api_breaking.multi_indexing: -MultiIndexing contracted from levels and codes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +MultiIndex constructed from levels and codes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Constructing a :class:`MultiIndex` with NaN levels or codes value < -1 was allowed previously. Now, construction with codes value < -1 is not allowed and NaN levels' corresponding codes From 81ebfda2bb66faa7ee2a7d279acb474b22db81a1 Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Tue, 21 May 2019 12:42:36 +0800 Subject: [PATCH 09/17] add ensure frozen for codes --- pandas/core/indexes/multi.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index cbd01ca252373..e2278591a0992 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -322,9 +322,12 @@ def _verify_integrity(self, codes=None, levels=None): values=[value for value in level], level=i)) - codes = FrozenList([self._validate_codes(level, code) - for level, code in zip(levels, codes)]) - return codes + codes = [self._validate_codes(level, code) + for level, code in zip(levels, codes)] + new_codes = FrozenList( + _ensure_frozen(level_codes, lev, copy=False)._shallow_copy() + for lev, level_codes in zip(levels, codes)) + return new_codes @classmethod def from_arrays(cls, arrays, sortorder=None, names=None): From 7dfd6982a4937cbf301a54362c529a669b070e71 Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Tue, 21 May 2019 13:46:30 +0800 Subject: [PATCH 10/17] update whatsnew code display --- doc/source/whatsnew/v0.25.0.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index fb4ef1dcb3ed9..5fac0c13ca5ea 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -97,18 +97,18 @@ would be reassigned as -1. (:issue:`19387`) .. code-block:: ipython - MultiIndex(levels=[[nan, None, NaT, 128, 2]], - codes=[[0, -1, 1, 2, 3, 4]]) - MultiIndex(levels=[[1, 2]], - codes=[[0, -2]]) + Out[1]: MultiIndex(levels=[[nan, None, NaT, 128, 2]], + codes=[[0, -1, 1, 2, 3, 4]]) + Out[2]: MultiIndex(levels=[[1, 2]], + codes=[[0, -2]]) *New Behavior*: -.. ipython:: python +.. ipython:: ipython - MultiIndex(levels=[[nan, None, NaT, 128, 2]], + Out[1]: MultiIndex(levels=[[nan, None, NaT, 128, 2]], codes=[[-1, -1, -1, -1, 3, 4]]) - ValueError: On level 0, code value (-2) < -1 + Out[2]: ValueError: On level 0, code value (-2) < -1 .. _whatsnew_0250.api_breaking.groupby_apply_first_group_once: From ec265fad1adaa47e8ed61faf7a012749421c739a Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Wed, 22 May 2019 09:29:02 +0800 Subject: [PATCH 11/17] update whatsnew codeblocks --- doc/source/whatsnew/v0.25.0.rst | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 5fac0c13ca5ea..82e54f3cd4a85 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -90,25 +90,26 @@ would be reassigned as -1. (:issue:`19387`) .. ipython:: python - pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[0, -1, 1, 2, 3, 4]]) - pd.MultiIndex(levels=[[1, 2]], codes=[[0, -2]]) + mi1 = pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[0, -1, 1, 2, 3, 4]]) + mi2 = pd.MultiIndex(levels=[[1, 2]], codes=[[0, -2]]) *Previous Behavior*: .. code-block:: ipython + In [1]: mi1 Out[1]: MultiIndex(levels=[[nan, None, NaT, 128, 2]], codes=[[0, -1, 1, 2, 3, 4]]) + In [2]: mi2 Out[2]: MultiIndex(levels=[[1, 2]], codes=[[0, -2]]) *New Behavior*: -.. ipython:: ipython +.. ipython:: python - Out[1]: MultiIndex(levels=[[nan, None, NaT, 128, 2]], - codes=[[-1, -1, -1, -1, 3, 4]]) - Out[2]: ValueError: On level 0, code value (-2) < -1 + mi1 + mi2 .. _whatsnew_0250.api_breaking.groupby_apply_first_group_once: From af35fa32845a53eee3f842563144fa30e1bc33b4 Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Thu, 23 May 2019 09:01:49 +0800 Subject: [PATCH 12/17] shorten lines for whatsnew --- doc/source/whatsnew/v0.25.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 82e54f3cd4a85..71f427c3268e7 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -90,7 +90,8 @@ would be reassigned as -1. (:issue:`19387`) .. ipython:: python - mi1 = pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[0, -1, 1, 2, 3, 4]]) + mi1 = pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], + codes=[[0, -1, 1, 2, 3, 4]]) mi2 = pd.MultiIndex(levels=[[1, 2]], codes=[[0, -2]]) *Previous Behavior*: From 6ec67b3004433d7842e08f5551ff6ee7272a1dad Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Mon, 27 May 2019 10:34:35 +0800 Subject: [PATCH 13/17] remove shallow copy and update function descirptor --- pandas/core/indexes/multi.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e2278591a0992..6f79b4ad10716 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -256,10 +256,10 @@ def _validate_codes(self, level, code): Parameters ---------- - code : optional list + code : list, optional Code to reassign. - level : optional list - Level to check for Nan. + level : list, optional + Level to check for missing values (NaN, NaT, None). Returns ------- @@ -324,9 +324,7 @@ def _verify_integrity(self, codes=None, levels=None): codes = [self._validate_codes(level, code) for level, code in zip(levels, codes)] - new_codes = FrozenList( - _ensure_frozen(level_codes, lev, copy=False)._shallow_copy() - for lev, level_codes in zip(levels, codes)) + new_codes = FrozenList(codes) return new_codes @classmethod From 28c953f1dfb7ddd25e1ec2b06d437f9a845ad93b Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Wed, 29 May 2019 16:15:27 +0800 Subject: [PATCH 14/17] remove redundant validation of code --- pandas/core/indexes/multi.py | 8 +++++--- pandas/tests/indexes/multi/test_constructor.py | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6f79b4ad10716..db0826c7c1ba9 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -256,9 +256,9 @@ def _validate_codes(self, level, code): Parameters ---------- - code : list, optional + code : list Code to reassign. - level : list, optional + level : list Level to check for missing values (NaN, NaT, None). Returns @@ -734,7 +734,9 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, level_codes, lev, copy=copy)._shallow_copy() new_codes = FrozenList(new_codes) - new_codes = self._verify_integrity(codes=new_codes) + if verify_integrity: + new_codes = self._verify_integrity(codes=new_codes) + self._codes = new_codes self._tuples = None diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index b8959bb139732..a8074f5e915bd 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -91,6 +91,7 @@ def test_constructor_mismatched_codes_levels(idx): def test_na_levels(): # GH26408 + # test if codes are re-assigned value -1 for levels with na values result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[0, -1, 1, 2, 3, 4]]) expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], From 055999de4d413d41f2b13170c8bd2e7efbc97b6a Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Wed, 29 May 2019 18:07:24 +0800 Subject: [PATCH 15/17] add explanatory comments and add type annotation --- pandas/core/indexes/multi.py | 4 ++-- pandas/tests/indexes/multi/test_constructor.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index db0826c7c1ba9..6dc0aedfdec06 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -250,7 +250,7 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, return result - def _validate_codes(self, level, code): + def _validate_codes(self, level: list, code: list): """ Reassign code values as -1 if their corresponding levels are NaN. @@ -264,7 +264,7 @@ def _validate_codes(self, level, code): Returns ------- code : new code where code value = -1 if it corresponds - to a NaN level. + to a level with missing values (NaN, NaT, None). """ null_mask = isna(level) if np.any(null_mask): diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index a8074f5e915bd..156d408338590 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -91,7 +91,8 @@ def test_constructor_mismatched_codes_levels(idx): def test_na_levels(): # GH26408 - # test if codes are re-assigned value -1 for levels with na values + # test if codes are re-assigned value -1 for levels + # with mising values (NaN, NaT, None) result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[0, -1, 1, 2, 3, 4]]) expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], From d9db853a05a817d4e7709ce0d1169e0ec2f3c12a Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Wed, 29 May 2019 18:23:58 +0800 Subject: [PATCH 16/17] add test for verify _integrity=False --- pandas/tests/indexes/multi/test_constructor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 156d408338590..197c06aacf90e 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -84,6 +84,11 @@ def test_constructor_mismatched_codes_levels(idx): with pytest.raises(ValueError, match=label_error): idx.copy().set_codes([[0, 0, 0, 0], [0, 0]]) + # test set_codes with verify_integrity=False + # the setting should not raise any value error + idx.copy().set_codes(codes=[[0, 0, 0, 0], [0, 0]], + verify_integrity=False) + # code value smaller than -1 with pytest.raises(ValueError, match=code_value_error): MultiIndex(levels=[['a'], ['b']], codes=[[0, -2], [0, 0]]) From a141e4041cca521a31c09ab5a1c03f409bc0ce42 Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Fri, 31 May 2019 12:50:04 +0800 Subject: [PATCH 17/17] add dropna tests --- pandas/tests/indexes/multi/test_missing.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index ed90f74d80989..518c12bb20e13 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -73,6 +73,21 @@ def test_dropna(): with pytest.raises(ValueError, match=msg): idx.dropna(how='xxx') + # GH26408 + # test if missing values are dropped for mutiindex constructed + # from codes and values + idx = MultiIndex(levels=[[np.nan, None, pd.NaT, "128", 2], + [np.nan, None, pd.NaT, "128", 2]], + codes=[[0, -1, 1, 2, 3, 4], + [0, -1, 3, 3, 3, 4]]) + expected = MultiIndex.from_arrays([["128", 2], ["128", 2]]) + tm.assert_index_equal(idx.dropna(), expected) + tm.assert_index_equal(idx.dropna(how='any'), expected) + + expected = MultiIndex.from_arrays([[np.nan, np.nan, "128", 2], + ["128", "128", "128", 2]]) + tm.assert_index_equal(idx.dropna(how='all'), expected) + def test_nulls(idx): # this is really a smoke test for the methods