Skip to content

Commit

Permalink
Merge pull request #8172 from behzadnouri/lj-cnt-sort
Browse files Browse the repository at this point in the history
counting sort instead of np.argsort in left outer join
  • Loading branch information
jreback committed Sep 6, 2014
2 parents 3ecb760 + f4768cc commit 7800290
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
4 changes: 2 additions & 2 deletions pandas/src/join.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -106,15 +106,15 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
if not sort: # if not asked to sort, revert to original order
if len(left) == len(left_indexer):
# no multiple matches for any row on the left
# this is a short-cut to avoid np.argsort;
# this is a short-cut to avoid groupsort_indexer
# otherwise, the `else` path also works in this case
if left_sorter.dtype != np.int_:
left_sorter = left_sorter.astype(np.int_)

rev = np.empty(len(left), dtype=np.int_)
rev.put(left_sorter, np.arange(len(left)))
else:
rev = np.argsort(left_indexer)
rev, _ = groupsort_indexer(left_indexer, len(left))

right_indexer = right_indexer.take(rev)
left_indexer = left_indexer.take(rev)
Expand Down
13 changes: 13 additions & 0 deletions vb_suite/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,3 +237,16 @@ def sample(values, k):
join_non_unique_equal = Benchmark('fracofday * temp[fracofday.index]', setup,
start_date=datetime(2013, 1, 1))


# Benchmark fixture for joining a frame against another frame's index.
# The setup code seeds NumPy's RNG, then builds two frames of n = 50000 rows
# and 2 columns of random integers; `right` is re-indexed on its 'jolie'
# column. With n = 50000 the upper bound n/500 is 100, so key values repeat
# heavily and each left row matches many right rows.
# NOTE(review): `n/500` is true division under Python 3 and would hand
# np.random.randint a float bound -- confirm the interpreter this suite targets.
setup = common_setup + '''
np.random.seed(2718281)
n = 50000
left = DataFrame(np.random.randint(1, n/500, (n, 2)),
columns=['jim', 'joe'])
right = DataFrame(np.random.randint(1, n/500, (n, 2)),
columns=['jolie', 'jolia']).set_index('jolie')
'''

# Times `left.join(right, on='jim')`, i.e. left's 'jim' column joined against
# right's index. Presumably added to exercise the non-unique-match path of the
# left outer join changed in the same commit -- verify against join.pyx.
left_outer_join_index = Benchmark("left.join(right, on='jim')", setup)

0 comments on commit 7800290

Please sign in to comment.