Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Check integrity of sparse int indices #15863

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -944,6 +944,7 @@ Indexing
^^^^^^^^

- Bug in ``Index`` power operations with reversed operands (:issue:`14973`)
- Bug in sparse array indexing in which indices were not being validated (:issue:`15863`)
- Bug in ``DataFrame.sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`)
- Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`)
- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`, :issue:`15424`)
Expand Down
48 changes: 41 additions & 7 deletions pandas/sparse/sparse.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,9 @@ cdef inline int int_min(int a, int b): return a if a <= b else b

cdef class SparseIndex:
"""
Abstract superclass for sparse index types
Abstract superclass for sparse index types.
"""

def __init__(self):
raise NotImplementedError

Expand All @@ -48,8 +49,9 @@ cdef class IntIndex(SparseIndex):
----------
length : integer
indices : array-like
Contains integers corresponding to
Contains integers corresponding to the indices.
"""

cdef readonly:
Py_ssize_t length, npoints
ndarray indices
Expand All @@ -59,9 +61,11 @@ cdef class IntIndex(SparseIndex):
self.indices = np.ascontiguousarray(indices, dtype=np.int32)
self.npoints = len(self.indices)

self.check_integrity()

def __reduce__(self):
args = (self.length, self.indices)
return (IntIndex, args)
return IntIndex, args

def __repr__(self):
output = 'IntIndex\n'
Expand All @@ -70,10 +74,40 @@ cdef class IntIndex(SparseIndex):

def check_integrity(self):
"""
Only need be strictly ascending and nothing less than 0 or greater than
total length
Checks the following:

- Indices are strictly ascending
- Number of indices is at most self.length
- Indices are at least 0 and at most the total length less one

A ValueError is raised if any of these conditions is violated.
"""
pass

cdef:
int32_t index, prev = -1

if self.npoints > self.length:
msg = ("Too many indices. Expected "
"{exp} but found {act}").format(
exp=self.length, act=self.npoints)
raise ValueError(msg)

# Indices are vacuously ordered and non-negative
# if the sequence of indices is empty.
if self.npoints == 0:
return

if min(self.indices) < 0:
raise ValueError("No index can be less than zero")

if max(self.indices) >= self.length:
raise ValueError("All indices must be less than the length")

for index in self.indices:
if prev != -1 and index <= prev:
raise ValueError("Indices must be strictly increasing")

prev = index

def equals(self, other):
if not isinstance(other, IntIndex):
Expand Down Expand Up @@ -320,7 +354,7 @@ cdef class BlockIndex(SparseIndex):

def __reduce__(self):
args = (self.length, self.blocs, self.blengths)
return (BlockIndex, args)
return BlockIndex, args

def __repr__(self):
output = 'BlockIndex\n'
Expand Down
38 changes: 38 additions & 0 deletions pandas/tests/sparse/test_libsparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,44 @@ def test_to_block_index(self):

class TestIntIndex(tm.TestCase):

def test_check_integrity(self):

# Too many indices than specified in self.length
msg = "Too many indices"

with tm.assertRaisesRegexp(ValueError, msg):
IntIndex(length=1, indices=[1, 2, 3])

# No index can be negative.
msg = "No index can be less than zero"

with tm.assertRaisesRegexp(ValueError, msg):
IntIndex(length=5, indices=[1, -2, 3])

# No index can be negative.
msg = "No index can be less than zero"

with tm.assertRaisesRegexp(ValueError, msg):
IntIndex(length=5, indices=[1, -2, 3])

# All indices must be less than the length.
msg = "All indices must be less than the length"

with tm.assertRaisesRegexp(ValueError, msg):
IntIndex(length=5, indices=[1, 2, 5])

with tm.assertRaisesRegexp(ValueError, msg):
IntIndex(length=5, indices=[1, 2, 6])

# Indices must be strictly ascending.
msg = "Indices must be strictly increasing"

with tm.assertRaisesRegexp(ValueError, msg):
IntIndex(length=5, indices=[1, 3, 2])

with tm.assertRaisesRegexp(ValueError, msg):
IntIndex(length=5, indices=[1, 3, 3])

def test_int_internal(self):
idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='integer')
self.assertIsInstance(idx, IntIndex)
Expand Down