Skip to content

Commit

Permalink
PERF: Sparse IntIndex.make_union / Numeric ops
Browse files Browse the repository at this point in the history
Author: sinhrks <sinhrks@gmail.com>

Closes #13036 from sinhrks/sparse_make_union and squashes the following commits:

b1cf4b5 [sinhrks] PERF: Sparse IntIndex.make_union
  • Loading branch information
sinhrks authored and jreback committed Apr 30, 2016
1 parent 3e9c320 commit 3ff5af0
Show file tree
Hide file tree
Showing 4 changed files with 188 additions and 151 deletions.
41 changes: 40 additions & 1 deletion asv_bench/benchmarks/sparse.py
Expand Up @@ -52,4 +52,43 @@ def setup(self):
self.ss = self.s.to_sparse()

def time_sparse_series_to_coo(self):
    """Benchmark ``to_coo`` on ``self.ss``, the object ``setup`` creates
    via ``to_sparse()``.
    """
    # Exactly one conversion per timing run; a repeated call would double
    # the measured time without exercising anything new.
    self.ss.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True)


class sparse_arithmetic(object):
    """Benchmarks for addition and division of two ``pd.SparseArray`` operands.

    Three operand pairs are prepared in ``setup``; every array has one
    million slots:

    * ``*_10percent``      -- at most 100000 populated slots, NaN fill value
    * ``*_10percent_zero`` -- at most 100000 populated slots, fill value 0
    * ``*_1percent``       -- at most 10000 populated slots, NaN fill value
    """
    goal_time = 0.2

    def setup(self):
        """Seed the RNG and build the operand pairs used by the timings."""
        # Fixed seed so every run benchmarks the same data. The creation
        # order below determines which random draws each array receives.
        np.random.seed(1)
        self.a_10percent = self.make_sparse_array(
            length=1000000, dense_size=100000, fill_value=np.nan)
        self.b_10percent = self.make_sparse_array(
            length=1000000, dense_size=100000, fill_value=np.nan)

        self.a_10percent_zero = self.make_sparse_array(
            length=1000000, dense_size=100000, fill_value=0)
        self.b_10percent_zero = self.make_sparse_array(
            length=1000000, dense_size=100000, fill_value=0)

        self.a_1percent = self.make_sparse_array(
            length=1000000, dense_size=10000, fill_value=np.nan)
        self.b_1percent = self.make_sparse_array(
            length=1000000, dense_size=10000, fill_value=np.nan)

    def make_sparse_array(self, length, dense_size, fill_value):
        """Return a ``pd.SparseArray`` of ``length`` float64 slots.

        Parameters
        ----------
        length : int
            Total number of slots.
        dense_size : int
            Number of random positions drawn. Duplicate draws are removed
            with ``np.unique``, so this is an upper bound on the number of
            slots that get overwritten.
        fill_value : scalar
            Value stored in every slot that is not overwritten, and passed
            on as the ``fill_value`` of the resulting ``SparseArray``.
        """
        # Allocate and fill in NumPy directly rather than materialising a
        # ``length``-element Python list first.
        arr = np.empty(length, dtype=np.float64)
        arr.fill(fill_value)
        indexer = np.unique(np.random.randint(0, length, dense_size))
        # Overwritten slots receive integers drawn from [0, 100).
        arr[indexer] = np.random.randint(0, 100, len(indexer))
        return pd.SparseArray(arr, fill_value=fill_value)

    def time_sparse_addition_10percent(self):
        self.a_10percent + self.b_10percent

    def time_sparse_addition_10percent_zero(self):
        self.a_10percent_zero + self.b_10percent_zero

    def time_sparse_addition_1percent(self):
        self.a_1percent + self.b_1percent

    def time_sparse_division_10percent(self):
        self.a_10percent / self.b_10percent

    def time_sparse_division_10percent_zero(self):
        self.a_10percent_zero / self.b_10percent_zero

    def time_sparse_division_1percent(self):
        self.a_1percent / self.b_1percent
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.1.txt
Expand Up @@ -507,6 +507,7 @@ Performance Improvements
- Improved performance of ``Period`` construction and time series plotting (:issue:`12903`, :issue:`11831`).
- Improved performance of ``.str.encode()`` and ``.str.decode()`` methods (:issue:`13008`)
- Improved performance of ``to_numeric`` if input is numeric dtype (:issue:`12777`)
- Improved performance of sparse arithmetic with ``IntIndex`` (:issue:`13036`)



Expand Down
252 changes: 141 additions & 111 deletions pandas/sparse/tests/test_libsparse.py
Expand Up @@ -43,117 +43,147 @@ def _check_case_dict(case):
_check_case([], [], [], [], [], [])


def test_index_make_union():
    """Check ``make_union`` on BlockIndex pairs and on their IntIndex forms.

    Each case gives block starts/lengths for ``x`` and ``y`` and the block
    starts/lengths expected from their union.
    """

    def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
        left = BlockIndex(TEST_LENGTH, xloc, xlen)
        right = BlockIndex(TEST_LENGTH, yloc, ylen)

        block_union = left.make_union(right)
        assert (isinstance(block_union, BlockIndex))
        assert_equal(block_union.blocs, eloc)
        assert_equal(block_union.blengths, elen)

        # The integer-index union must agree with the block-index union.
        int_union = left.to_int_index().make_union(right.to_int_index())
        assert (isinstance(int_union, IntIndex))
        assert_equal(int_union.indices, block_union.to_int_index().indices)

    # (xloc, xlen, yloc, ylen, eloc, elen)
    cases = [
        # y starts exactly where x ends: one merged block
        ([0], [5], [5], [4], [0], [9]),
        # y's first block overlaps x's first; the remaining blocks are disjoint
        ([0, 10], [5, 5], [2, 17], [5, 2], [0, 10, 17], [7, 5, 2]),
        # single partially overlapping blocks
        ([1], [5], [3], [5], [1], [7]),
        # one y block bridges the gap between two x blocks
        ([2, 10], [4, 4], [4], [8], [2], [12]),
        # y starts with x's first block and reaches into x's second
        ([0, 5], [3, 5], [0], [7], [0], [10]),
        # chained overlaps collapse into a single block
        ([2, 10], [4, 4], [4, 13], [8, 4], [2], [15]),
        # every y block lies inside the single x block
        ([2], [15], [4, 9, 14], [3, 2, 2], [2], [15]),
        # fully disjoint, interleaved blocks
        ([0, 10], [3, 3], [5, 15], [2, 2], [0, 5, 10, 15], [3, 2, 3, 2]),
    ]
    for case in cases:
        _check_case(*case)
# TODO: different-length index objects
class TestSparseIndexUnion(tm.TestCase):
    """Tests for ``make_union`` on BlockIndex and IntIndex objects."""

    def test_index_make_union(self):
        """Union of BlockIndex pairs, cross-checked against IntIndex."""

        def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
            left = BlockIndex(TEST_LENGTH, xloc, xlen)
            right = BlockIndex(TEST_LENGTH, yloc, ylen)

            block_union = left.make_union(right)
            assert (isinstance(block_union, BlockIndex))
            assert_equal(block_union.blocs, eloc)
            assert_equal(block_union.blengths, elen)

            # The integer-index union must agree with the block-index union.
            int_union = left.to_int_index().make_union(right.to_int_index())
            assert (isinstance(int_union, IntIndex))
            assert_equal(int_union.indices,
                         block_union.to_int_index().indices)

        # (xloc, xlen, yloc, ylen, eloc, elen)
        cases = [
            # y starts exactly where x ends: one merged block
            ([0], [5], [5], [4], [0], [9]),
            # y's first block overlaps x's first; the rest are disjoint
            ([0, 10], [5, 5], [2, 17], [5, 2], [0, 10, 17], [7, 5, 2]),
            # single partially overlapping blocks
            ([1], [5], [3], [5], [1], [7]),
            # one y block bridges the gap between two x blocks
            ([2, 10], [4, 4], [4], [8], [2], [12]),
            # y starts with x's first block and reaches into x's second
            ([0, 5], [3, 5], [0], [7], [0], [10]),
            # chained overlaps collapse into a single block
            ([2, 10], [4, 4], [4, 13], [8, 4], [2], [15]),
            # every y block lies inside the single x block
            ([2], [15], [4, 9, 14], [3, 2, 2], [2], [15]),
            # fully disjoint, interleaved blocks
            ([0, 10], [3, 3], [5, 15], [2, 2],
             [0, 5, 10, 15], [3, 2, 3, 2]),
        ]
        for case in cases:
            _check_case(*case)

    def test_intindex_make_union(self):
        """Union of IntIndex pairs, including empty and identical inputs."""
        # (indices of a, indices of b, expected union); all of length 5
        cases = [
            ([0, 3, 4], [0, 2], [0, 2, 3, 4]),
            ([], [0, 2], [0, 2]),
            ([], [], []),
            ([0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]),
        ]
        for a_points, b_points, union_points in cases:
            a = IntIndex(5, np.array(a_points, dtype=np.int32))
            b = IntIndex(5, np.array(b_points, dtype=np.int32))
            exp = IntIndex(5, np.array(union_points, np.int32))
            res = a.make_union(b)
            self.assertTrue(res.equals(exp))

        # Indexes over different underlying lengths cannot be combined.
        a = IntIndex(5, np.array([0, 1], dtype=np.int32))
        b = IntIndex(4, np.array([0, 1], dtype=np.int32))
        with tm.assertRaises(ValueError):
            a.make_union(b)


class TestSparseIndexCommon(tm.TestCase):
Expand Down
45 changes: 6 additions & 39 deletions pandas/src/sparse.pyx
Expand Up @@ -129,52 +129,19 @@ cdef class IntIndex(SparseIndex):
return IntIndex(self.length, new_list)

cpdef IntIndex make_union(self, SparseIndex y_):
    """
    Return a new IntIndex containing every position held by either index.

    Parameters
    ----------
    y_ : SparseIndex
        The other index; it is converted with ``to_int_index()`` before
        the union is computed.

    Returns
    -------
    IntIndex
        Built with ``self.length`` and the result of ``np.union1d`` on the
        two ``indices`` arrays (sorted, without duplicates).

    Raises
    ------
    ValueError
        If the two indexes do not share the same ``length``.
    """
    cdef:
        ndarray[int32_t, ndim=1] new_indices
        IntIndex y

    # if y_ is already an IntIndex, to_int_index() returns it unchanged
    y = y_.to_int_index()

    if self.length != y.length:
        raise ValueError('Indices must reference same underlying length')

    # A single vectorised union replaces the former element-by-element
    # merge loop that appended to a Python list.
    new_indices = np.union1d(self.indices, y.indices)
    return IntIndex(self.length, new_indices)

@cython.wraparound(False)
cpdef int lookup(self, Py_ssize_t index):
Expand Down

0 comments on commit 3ff5af0

Please sign in to comment.