Skip to content

Commit

Permalink
ENH: factorize + groupsort to massively improve multi-column sort_ind…
Browse files Browse the repository at this point in the history
…ex performance. Add vbench test case, GH #555
  • Loading branch information
wesm committed Jan 10, 2012
1 parent abb9422 commit 18ca639
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 8 deletions.
3 changes: 3 additions & 0 deletions RELEASE.rst
Expand Up @@ -57,6 +57,7 @@ pandas 0.7.0
- Can pass multiple DataFrames to ``DataFrame.join`` to join on index (GH #115)
- Can pass multiple Panels to ``Panel.join`` (GH #115)
- Can pass multiple DataFrames to ``DataFrame.append`` to concatenate (stack)
and multiple Series to ``Series.append`` too

**API Changes**

Expand Down Expand Up @@ -107,6 +108,8 @@ pandas 0.7.0
- Made ``Index._get_duplicates`` a public method by removing the underscore
- Prettier printing of floats, and column spacing fix (GH #395, GH #571)
- Add ``bold_rows`` option to DataFrame.to_html (GH #586)
- Improve the performance of ``DataFrame.sort_index`` (5x or more in some
cases) when sorting by multiple columns (GH #555)

**Bug fixes**

Expand Down
29 changes: 24 additions & 5 deletions pandas/core/frame.py
Expand Up @@ -1965,11 +1965,10 @@ def sort_index(self, axis=0, by=None, ascending=True):
if by is not None:
assert(axis == 0)
if isinstance(by, (tuple, list)):
to_sort = lib.fast_zip([self[x] for x in by])
keys = [self[x].values for x in by]
indexer = _lexsort_indexer(keys)
else:
to_sort = self[by].values

indexer = to_sort.argsort()
indexer = self[by].values.argsort()
else:
indexer = labels.argsort()

Expand Down Expand Up @@ -2800,7 +2799,7 @@ def append(self, other, ignore_index=False, verify_integrity=True):
return other.copy()

from pandas.tools.merge import concat
if isinstance(other, list):
if isinstance(other, (list, tuple)):
to_concat = [self] + other
else:
to_concat = [self, other]
Expand Down Expand Up @@ -3856,6 +3855,26 @@ def complete_dataframe(obj, prev_completions):
except Exception:
pass

def _lexsort_indexer(keys):
    """
    Compute a sorting indexer from several key arrays.

    Every key is converted to object dtype when it is not already, then
    factorized with ``sort=True`` by a fresh ``lib.Factorizer``. The
    resulting integer ids and the number of uniques per key are passed to
    ``get_group_index`` and ``_compress_group_index``, and the compressed
    ids (cast to int32) are handed to ``lib.groupsort_indexer``.

    Parameters
    ----------
    keys : sequence of ndarray
        Arrays to sort by; each must expose ``dtype`` and ``astype``.
        NOTE(review): presumably the first key is the most significant
        one -- confirm against ``get_group_index``.

    Returns
    -------
    indexer
        The first element of the pair returned by
        ``lib.groupsort_indexer``.
    """
    from pandas.core.groupby import get_group_index, _compress_group_index

    id_arrays = []
    n_uniques = []
    for values in keys:
        # The factorizer is fed object arrays only, so cast anything else.
        if values.dtype != np.object_:
            values = values.astype('O')

        factorizer = lib.Factorizer(len(values))
        codes, _ = factorizer.factorize(values, sort=True)

        id_arrays.append(codes)
        n_uniques.append(len(factorizer.uniques))

    combined = get_group_index(id_arrays, n_uniques)
    compressed, _, n_groups = _compress_group_index(combined)

    # groupsort_indexer is called with int32 ids, hence the explicit cast.
    sorter, _ = lib.groupsort_indexer(compressed.astype('i4'), n_groups)
    return sorter


if __name__ == '__main__':
import nose
Expand Down
6 changes: 3 additions & 3 deletions pandas/src/hashtable.pyx
Expand Up @@ -693,9 +693,9 @@ cdef class PyObjectHashTable:
return uniques

def factorize(self, ndarray[object] values):
# NOTE(review): this hunk is rendered without +/- markers, so two versions of
# the body appear back to back. The next three lines look like the removed
# version: they hand a dict to get_labels, whose signature further down
# declares ``list uniques`` -- TODO confirm against the raw diff.
reverse = {}
labels, counts = self.get_labels(values, reverse, 0)
return reverse, labels, counts
# Presumably the replacement body: the uniques are collected in a list and the
# return order becomes (labels, counts, uniques) -- the container that used to
# come first is now last, so callers unpacking the old order must be updated.
uniques = []
labels, counts = self.get_labels(values, uniques, 0)
return labels, counts, uniques # reverse, labels, counts

cpdef get_labels(self, ndarray[object] values, list uniques,
Py_ssize_t count_prior):
Expand Down
20 changes: 20 additions & 0 deletions vb_suite/reindex.py
Expand Up @@ -108,3 +108,23 @@ def backfill():
name='reindex_frame_level_reindex',
start_date=datetime(2011, 12, 27))


#----------------------------------------------------------------------
# sort_index

# pathological, but realistic
#
# Benchmark for DataFrame.sort_index(by=[...]) on two object-dtype key columns.
# The setup builds N values per key with rands(10) (presumably random
# 10-character strings -- rands is expected to come from common_setup, which
# is not shown here) and repeats each value K times, so the frame has N * K
# rows. key1 and key2 are generated independently of each other.
setup = common_setup + """
N = 10000
K = 10
key1 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)
key2 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)
df = DataFrame({'key1' : key1, 'key2' : key2,
'value' : np.random.randn(N * K)})
"""
# The timed statement: sort the frame by both key columns.
statement = "df.sort_index(by=['key1', 'key2'])"
frame_sort_index_by_columns = Benchmark(statement, setup,
name='frame_sort_index_by_columns',
start_date=datetime(2011, 11, 1))

0 comments on commit 18ca639

Please sign in to comment.