ENH: factorize + groupsort to massively improve multi-column sort_index performance. Add vbench test case, GH #555
pandas-dev · Jan 10, 2012 · 18ca639 · 18ca639
1 parent abb9422
commit 18ca639
Show file tree

Hide file tree

Showing 4 changed files with 50 additions and 8 deletions.
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -57,6 +57,7 @@ pandas 0.7.0
   - Can pass multiple DataFrames to ``DataFrame.join`` to join on index (GH #115)
   - Can pass multiple Panels to ``Panel.join`` (GH #115)
   - Can pass multiple DataFrames to `DataFrame.append` to concatenate (stack)
+    and multiple Series to ``Series.append`` too
 
 **API Changes**
 
@@ -107,6 +108,8 @@ pandas 0.7.0
   - Made ``Index._get_duplicates`` a public method by removing the underscore
   - Prettier printing of floats, and column spacing fix (GH #395, GH #571)
   - Add ``bold_rows`` option to DataFrame.to_html (GH #586)
+  - Improve the performance of ``DataFrame.sort_index`` by up to 5x or more
+    when sorting by multiple columns (GH #555)
 
 **Bug fixes**
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1965,11 +1965,10 @@ def sort_index(self, axis=0, by=None, ascending=True):
         if by is not None:
             assert(axis == 0)
             if isinstance(by, (tuple, list)):
-                to_sort = lib.fast_zip([self[x] for x in by])
+                keys = [self[x].values for x in by]
+                indexer = _lexsort_indexer(keys)
             else:
-                to_sort = self[by].values
-
-            indexer = to_sort.argsort()
+                indexer = self[by].values.argsort()
         else:
             indexer = labels.argsort()
 
@@ -2800,7 +2799,7 @@ def append(self, other, ignore_index=False, verify_integrity=True):
             return other.copy()
 
         from pandas.tools.merge import concat
-        if isinstance(other, list):
+        if isinstance(other, (list, tuple)):
             to_concat = [self] + other
         else:
             to_concat = [self, other]
@@ -3856,6 +3855,26 @@ def complete_dataframe(obj, prev_completions):
     except Exception:
         pass
 
+def _lexsort_indexer(keys):
+    """Build the row indexer ``sort_index`` uses for multiple ``by`` keys."""
+    from pandas.core.groupby import get_group_index, _compress_group_index
+
+    codes, sizes = [], []
+    for values in keys:
+        if values.dtype != np.object_:
+            values = values.astype('O')
+
+        factorizer = lib.Factorizer(len(values))
+        ids, _ = factorizer.factorize(values, sort=True)
+        codes.append(ids)
+        sizes.append(len(factorizer.uniques))
+
+    # combine per-key ids into a single group index, compress it, group-sort
+    flat_ids = get_group_index(codes, sizes)
+    comp_ids, _, max_group = _compress_group_index(flat_ids)
+    indexer, _ = lib.groupsort_indexer(comp_ids.astype('i4'), max_group)
+    return indexer
+
 
 if __name__ == '__main__':
     import nose

diff --git a/pandas/src/hashtable.pyx b/pandas/src/hashtable.pyx
@@ -693,9 +693,9 @@ cdef class PyObjectHashTable:
         return uniques
 
     def factorize(self, ndarray[object] values):
-        reverse = {}
-        labels, counts = self.get_labels(values, reverse, 0)
-        return reverse, labels, counts
+        uniques = []
+        labels, counts = self.get_labels(values, uniques, 0)
+        return labels, counts, uniques  # order changed; was (reverse, labels, counts)
 
     cpdef get_labels(self, ndarray[object] values, list uniques,
                      Py_ssize_t count_prior):

diff --git a/vb_suite/reindex.py b/vb_suite/reindex.py
@@ -108,3 +108,23 @@ def backfill():
               name='reindex_frame_level_reindex',
               start_date=datetime(2011, 12, 27))
 
+
+#----------------------------------------------------------------------
+# sort_index
+
+# pathological, but realistic: two object key columns of N values, each repeated K times
+setup = common_setup + """
+N = 10000
+K = 10
+
+key1 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)
+key2 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)
+
+df = DataFrame({'key1' : key1, 'key2' : key2,
+                'value' : np.random.randn(N * K)})
+"""
+statement = "df.sort_index(by=['key1', 'key2'])"
+frame_sort_index_by_columns = Benchmark(statement, setup,
+                                        name='frame_sort_index_by_columns',
+                                        start_date=datetime(2011, 11, 1))
+