Skip to content

Commit

Permalink
BUG: Check integrity of sparse int indices
Browse files Browse the repository at this point in the history
  • Loading branch information
gfyoung committed Apr 2, 2017
1 parent d1e1ba0 commit c6e3e9c
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 7 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -944,6 +944,7 @@ Indexing
^^^^^^^^

- Bug in ``Index`` power operations with reversed operands (:issue:`14973`)
- Bug in sparse array indexing in which indices were not being validated (:issue:`15863`)
- Bug in ``DataFrame.sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`)
- Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`)
- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`, :issue:`15424`)
Expand Down
48 changes: 41 additions & 7 deletions pandas/sparse/sparse.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,9 @@ cdef inline int int_min(int a, int b): return a if a <= b else b

cdef class SparseIndex:
"""
Abstract superclass for sparse index types
Abstract superclass for sparse index types.
"""

def __init__(self):
raise NotImplementedError

Expand All @@ -48,8 +49,9 @@ cdef class IntIndex(SparseIndex):
----------
length : integer
indices : array-like
Contains integers corresponding to
Contains integers corresponding to the indices.
"""

cdef readonly:
Py_ssize_t length, npoints
ndarray indices
Expand All @@ -59,9 +61,11 @@ cdef class IntIndex(SparseIndex):
self.indices = np.ascontiguousarray(indices, dtype=np.int32)
self.npoints = len(self.indices)

self.check_integrity()

def __reduce__(self):
args = (self.length, self.indices)
return (IntIndex, args)
return IntIndex, args

def __repr__(self):
output = 'IntIndex\n'
Expand All @@ -70,10 +74,40 @@ cdef class IntIndex(SparseIndex):

def check_integrity(self):
"""
Only need be strictly ascending and nothing less than 0 or greater than
total length
Checks the following:
- Indices are strictly ascending
- Number of indices is at most self.length
- Indices are at least 0 and at most the total length less one
A ValueError is raised if any of these conditions is violated.
"""
pass

cdef:
int32_t index, prev = -1

if self.npoints > self.length:
msg = ("Too many indices. Expected "
"{exp} but found {act}").format(
exp=self.length, act=self.npoints)
raise ValueError(msg)

# Indices are vacuously ordered and non-negative
# if the sequence of indices is empty.
if self.npoints == 0:
return

if min(self.indices) < 0:
raise ValueError("No index can be less than zero")

if max(self.indices) >= self.length:
raise ValueError("All indices must be less than the length")

for index in self.indices:
if prev != -1 and index <= prev:
raise ValueError("Indices must be strictly increasing")

prev = index

def equals(self, other):
if not isinstance(other, IntIndex):
Expand Down Expand Up @@ -320,7 +354,7 @@ cdef class BlockIndex(SparseIndex):

def __reduce__(self):
args = (self.length, self.blocs, self.blengths)
return (BlockIndex, args)
return BlockIndex, args

def __repr__(self):
output = 'BlockIndex\n'
Expand Down
38 changes: 38 additions & 0 deletions pandas/tests/sparse/test_libsparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,44 @@ def test_to_block_index(self):

class TestIntIndex(tm.TestCase):

def test_check_integrity(self):

# Too many indices than specified in self.length
msg = "Too many indices"

with tm.assertRaisesRegexp(ValueError, msg):
IntIndex(length=1, indices=[1, 2, 3])

# No index can be negative.
msg = "No index can be less than zero"

with tm.assertRaisesRegexp(ValueError, msg):
IntIndex(length=5, indices=[1, -2, 3])

# No index can be negative.
msg = "No index can be less than zero"

with tm.assertRaisesRegexp(ValueError, msg):
IntIndex(length=5, indices=[1, -2, 3])

# All indices must be less than the length.
msg = "All indices must be less than the length"

with tm.assertRaisesRegexp(ValueError, msg):
IntIndex(length=5, indices=[1, 2, 5])

with tm.assertRaisesRegexp(ValueError, msg):
IntIndex(length=5, indices=[1, 2, 6])

# Indices must be strictly ascending.
msg = "Indices must be strictly increasing"

with tm.assertRaisesRegexp(ValueError, msg):
IntIndex(length=5, indices=[1, 3, 2])

with tm.assertRaisesRegexp(ValueError, msg):
IntIndex(length=5, indices=[1, 3, 3])

def test_int_internal(self):
idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='integer')
self.assertIsInstance(idx, IntIndex)
Expand Down

0 comments on commit c6e3e9c

Please sign in to comment.