-
-
Notifications
You must be signed in to change notification settings - Fork 17.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: MultiIndex not dropping nan level and invalid code value #26408
Changes from 7 commits
e0d3bb4
af2ad91
248177b
b6d29bd
041b4cd
933615b
d4227c6
b66d3a4
81ebfda
7dfd698
ec265fa
af35fa3
6ec67b3
28c953f
055999d
d9db853
a141e40
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -242,11 +242,35 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, | |
result.sortorder = sortorder | ||
|
||
if verify_integrity: | ||
result._verify_integrity() | ||
new_codes = result._verify_integrity() | ||
result._codes = new_codes | ||
|
||
if _set_identity: | ||
result._reset_identity() | ||
|
||
return result | ||
|
||
def _validate_codes(self, level, code): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you try to type annotate level and code here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure about type annotation, could you please point me to any example in the code base? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure we have started generic codes in pandas._typing (see how they are used); this can be a followup |
||
""" | ||
Reassign code values as -1 if their corresponding levels are NaN. | ||
|
||
Parameters | ||
---------- | ||
code : optional list | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
Code to reassign. | ||
level : optional list | ||
Level to check for Nan. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nan -> missing value (NaN, NaT, None) |
||
|
||
Returns | ||
------- | ||
code : new code where code value = -1 if it corresponds | ||
to a NaN level. | ||
""" | ||
null_mask = isna(level) | ||
if np.any(null_mask): | ||
code = np.where(null_mask[code], -1, code) | ||
return code | ||
|
||
def _verify_integrity(self, codes=None, levels=None): | ||
""" | ||
|
||
|
@@ -262,6 +286,11 @@ def _verify_integrity(self, codes=None, levels=None): | |
ValueError | ||
If length of levels and codes don't match, if the codes for any | ||
level would exceed level bounds, or there are any duplicate levels. | ||
|
||
Returns | ||
------- | ||
codes : new codes where code value = -1 if it corresponds to a | ||
NaN level. | ||
""" | ||
# NOTE: Currently does not check, among other things, that cached | ||
# nlevels matches nor that sortorder matches actually sortorder. | ||
|
@@ -271,22 +300,32 @@ def _verify_integrity(self, codes=None, levels=None): | |
if len(levels) != len(codes): | ||
raise ValueError("Length of levels and codes must match. NOTE:" | ||
" this index is in an inconsistent state.") | ||
codes_length = len(self.codes[0]) | ||
codes_length = len(codes[0]) | ||
for i, (level, level_codes) in enumerate(zip(levels, codes)): | ||
if len(level_codes) != codes_length: | ||
raise ValueError("Unequal code lengths: %s" % | ||
([len(code_) for code_ in codes])) | ||
if len(level_codes) and level_codes.max() >= len(level): | ||
raise ValueError("On level %d, code max (%d) >= length of" | ||
" level (%d). NOTE: this index is in an" | ||
" inconsistent state" % (i, level_codes.max(), | ||
len(level))) | ||
msg = ("On level {level}, code max ({max_code}) >= length of " | ||
"level ({level_len}). NOTE: this index is in an " | ||
"inconsistent state".format( | ||
level=i, max_code=level_codes.max(), | ||
level_len=len(level))) | ||
raise ValueError(msg) | ||
if len(level_codes) and level_codes.min() < -1: | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
raise ValueError("On level {level}, code value ({code})" | ||
" < -1".format( | ||
level=i, code=level_codes.min())) | ||
if not level.is_unique: | ||
raise ValueError("Level values must be unique: {values} on " | ||
"level {level}".format( | ||
values=[value for value in level], | ||
level=i)) | ||
|
||
codes = FrozenList([self._validate_codes(level, code) | ||
for level, code in zip(levels, codes)]) | ||
return codes | ||
|
||
@classmethod | ||
def from_arrays(cls, arrays, sortorder=None, names=None): | ||
""" | ||
|
@@ -585,7 +624,8 @@ def _set_levels(self, levels, level=None, copy=False, validate=True, | |
new_levels = FrozenList(new_levels) | ||
|
||
if verify_integrity: | ||
self._verify_integrity(levels=new_levels) | ||
new_codes = self._verify_integrity(levels=new_levels) | ||
self._codes = new_codes | ||
|
||
names = self.names | ||
self._levels = new_levels | ||
|
@@ -675,7 +715,6 @@ def labels(self): | |
|
||
def _set_codes(self, codes, level=None, copy=False, validate=True, | ||
verify_integrity=False): | ||
|
||
if validate and level is None and len(codes) != self.nlevels: | ||
raise ValueError("Length of codes must match number of levels") | ||
if validate and level is not None and len(codes) != len(level): | ||
|
@@ -694,10 +733,9 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, | |
level_codes, lev, copy=copy)._shallow_copy() | ||
new_codes = FrozenList(new_codes) | ||
|
||
if verify_integrity: | ||
self._verify_integrity(codes=new_codes) | ||
|
||
new_codes = self._verify_integrity(codes=new_codes) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shouldn't this only be called if verfiy_integrity? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, changed |
||
self._codes = new_codes | ||
|
||
self._tuples = None | ||
self._reset_cache() | ||
|
||
|
@@ -1760,9 +1798,10 @@ def __setstate__(self, state): | |
|
||
self._set_levels([Index(x) for x in levels], validate=False) | ||
self._set_codes(codes) | ||
new_codes = self._verify_integrity() | ||
self._set_codes(new_codes) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't we end up calling verify_integrity twice? here (and in _set_codes)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now that the set_codes are called with default verify_integrity=False, verification would not be called twice |
||
self._set_names(names) | ||
self.sortorder = sortorder | ||
self._verify_integrity() | ||
self._reset_identity() | ||
|
||
def __getitem__(self, key): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,9 +64,10 @@ def test_constructor_mismatched_codes_levels(idx): | |
with pytest.raises(ValueError, match=msg): | ||
MultiIndex(levels=levels, codes=codes) | ||
|
||
length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." | ||
length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." | ||
" NOTE: this index is in an inconsistent state") | ||
label_error = r"Unequal code lengths: \[4, 2\]" | ||
code_value_error = r"On level 0, code value \(-2\) < -1" | ||
|
||
# important to check that it's looking at the right thing. | ||
with pytest.raises(ValueError, match=length_error): | ||
|
@@ -83,6 +84,37 @@ def test_constructor_mismatched_codes_levels(idx): | |
with pytest.raises(ValueError, match=label_error): | ||
idx.copy().set_codes([[0, 0, 0, 0], [0, 0]]) | ||
|
||
# code value smaller than -1 | ||
with pytest.raises(ValueError, match=code_value_error): | ||
MultiIndex(levels=[['a'], ['b']], codes=[[0, -2], [0, 0]]) | ||
|
||
|
||
def test_na_levels(): | ||
# GH26408 | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], | ||
codes=[[0, -1, 1, 2, 3, 4]]) | ||
expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], | ||
codes=[[-1, -1, -1, -1, 3, 4]]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
result = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], | ||
codes=[[0, -1, 1, 2, 3, 4]]) | ||
expected = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], | ||
codes=[[-1, -1, 1, -1, 3, -1]]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
# verify set_levels and set_codes | ||
result = MultiIndex( | ||
levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]]).set_levels( | ||
[[np.nan, 's', pd.NaT, 128, None]]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
result = MultiIndex( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you replace the actual OP tests as well (e.g. the .dropna()); I don't think they are very different that what you have, but they also attempt to .dropna() There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Dropna tests added, and green now. |
||
levels=[[np.nan, 's', pd.NaT, 128, None]], | ||
codes=[[1, 2, 2, 2, 2, 2]]).set_codes( | ||
[[0, -1, 1, 2, 3, 4]]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
|
||
def test_labels_deprecated(idx): | ||
# GH23752 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think "contracted" is the right word. Maybe "Changed allowed codes in MultiIndex"?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed the typo in whatsnew, I think I meant constructed