Skip to content

Commit

Permalink
ENH: add sort option to DataFrame.join + vbench, GH #731
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Feb 5, 2012
1 parent 5d55410 commit 3cb301e
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 35 deletions.
1 change: 1 addition & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ pandas 0.7.0
- Add ``isin`` method to Index objects, works just like ``Series.isin`` (GH
#657)
- Implement array interface on Panel so that ufuncs work (re: #740)
- Add ``sort`` option to ``DataFrame.join`` (GH #731)

**API Changes**

Expand Down
13 changes: 9 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3000,7 +3000,8 @@ def append(self, other, ignore_index=False, verify_integrity=True):
return concat(to_concat, ignore_index=ignore_index,
verify_integrity=verify_integrity)

def join(self, other, on=None, how='left', lsuffix='', rsuffix=''):
def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
sort=False):
"""
Join columns with other DataFrame either on index or on a key
column. Efficiently Join multiple DataFrame objects by index at once by
Expand Down Expand Up @@ -3028,6 +3029,9 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix=''):
Suffix to use from left frame's overlapping columns
rsuffix : string
Suffix to use from right frame's overlapping columns
sort : boolean, default False
Order result DataFrame lexicographically by the join key. If False,
preserves the index order of the calling (left) DataFrame
Notes
-----
Expand All @@ -3040,9 +3044,10 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix=''):
"""
# For SparseDataFrame's benefit
return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,
rsuffix=rsuffix)
rsuffix=rsuffix, sort=sort)

def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix=''):
def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
sort=False):
from pandas.tools.merge import merge, concat

if isinstance(other, Series):
Expand All @@ -3052,7 +3057,7 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix=''):
if isinstance(other, DataFrame):
return merge(self, other, left_on=on, how=how,
left_index=on is None, right_index=True,
suffixes=(lsuffix, rsuffix), sort=False)
suffixes=(lsuffix, rsuffix), sort=sort)
else:
if on is not None:
raise ValueError('Joining multiple DataFrames only supported'
Expand Down
3 changes: 2 additions & 1 deletion pandas/sparse/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,8 @@ def add_suffix(self, suffix):
f = ('%s' + ('%s' % suffix)).__mod__
return self.rename(columns=f)

def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix=''):
def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
sort=False):
if on is not None:
raise NotImplementedError
else:
Expand Down
71 changes: 43 additions & 28 deletions pandas/tools/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,29 +112,14 @@ def _get_join_info(self):
join_index, left_indexer, right_indexer = \
left_ax.join(right_ax, how=self.how, return_indexers=True)
elif self.right_index and self.how == 'left':
join_index = left_ax
left_indexer = None

if len(self.left_join_keys) > 1:
assert(isinstance(right_ax, MultiIndex) and
len(self.left_join_keys) == right_ax.nlevels)

right_indexer = _get_multiindex_indexer(self.left_join_keys,
right_ax, sort=False)
else:
right_indexer = right_ax.get_indexer(self.left_join_keys[0])
join_index, left_indexer, right_indexer = \
_left_join_on_index(left_ax, right_ax, self.left_join_keys,
sort=self.sort)

elif self.left_index and self.how == 'right':
join_index = right_ax
right_indexer = None

if len(self.right_join_keys) > 1:
assert(isinstance(left_ax, MultiIndex) and
len(self.right_join_keys) == left_ax.nlevels)
left_indexer = _get_multiindex_indexer(self.right_join_keys,
left_ax, sort=False)
else:
left_indexer = left_ax.get_indexer(self.right_join_keys[0])
join_index, right_indexer, left_indexer = \
_left_join_on_index(right_ax, left_ax, self.right_join_keys,
sort=self.sort)
else:
# max groups = largest possible number of distinct groups
left_key, right_key, max_groups = self._get_group_keys()
Expand Down Expand Up @@ -307,16 +292,16 @@ def _get_keys(frame, on, drop=False):
return frame, keys, names


def _get_multiindex_indexer(join_keys, index, sort=True):
def _get_multiindex_indexer(join_keys, index, sort=False):
shape = []
labels = []
for level, key in zip(index.levels, join_keys):
llab, rlab, count = _factorize_objects(level, key, sort=False)
labels.append(rlab)
shape.append(count)

left_group_key = get_group_index(labels, shape) #.astype('i4')
right_group_key = get_group_index(index.labels, shape) #.astype('i4')
left_group_key = get_group_index(labels, shape)
right_group_key = get_group_index(index.labels, shape)

left_group_key, right_group_key, max_groups = \
_factorize_int64(left_group_key, right_group_key,
Expand All @@ -327,17 +312,47 @@ def _get_multiindex_indexer(join_keys, index, sort=True):
right_group_key.astype('i4'),
max_groups, sort=False)

return right_indexer
return left_indexer, right_indexer

def _get_single_indexer(join_key, index, sort=False):
    """
    Compute left/right indexers for joining a single key array on an index.

    Parameters
    ----------
    join_key : array-like
        Key values from the left-hand side of the join
    index : Index
        Index of the right-hand side being joined on
    sort : boolean, default False
        Forwarded both to the factorization step and to the left outer join

    Returns
    -------
    (left_indexer, right_indexer) : tuple
        The pair produced by ``lib.left_outer_join`` on the factorized keys
    """
    # Map both sides onto a shared set of integer labels; ``ngroups`` is the
    # count returned alongside the labels and is handed to the join routine.
    lkey, rkey, ngroups = _factorize_objects(join_key, index, sort=sort)

    lkey_i4 = lkey.astype('i4')
    rkey_i4 = rkey.astype('i4')

    # NOTE: the indexers are returned exactly as the join routine produced
    # them. An extra reordering step
    # (``right_indexer.take(left_indexer.argsort())``) was considered by the
    # original author but deliberately left out as unnecessary after
    # factorizing.
    lidx, ridx = lib.left_outer_join(lkey_i4, rkey_i4, ngroups, sort=sort)
    return lidx, ridx

def _right_outer_join(x, y, max_groups):
    """
    Right outer join implemented as a left outer join with swapped operands.

    ``lib.left_outer_join`` is called with ``y`` first and ``x`` second; the
    two indexers it returns are then swapped back so that the result is
    ordered as (indexer for ``x``, indexer for ``y``).
    """
    swapped = lib.left_outer_join(y, x, max_groups)
    y_indexer, x_indexer = swapped
    return x_indexer, y_indexer

def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
join_index = left_ax
left_indexer = None

if len(join_keys) > 1:
assert(isinstance(right_ax, MultiIndex) and
len(join_keys) == right_ax.nlevels)

left_tmp, right_indexer = \
_get_multiindex_indexer(join_keys, right_ax,
sort=sort)
if sort:
left_indexer = left_tmp
join_index = left_ax.take(left_indexer)
else:
jkey = join_keys[0]
if sort:
left_indexer, right_indexer = \
_get_single_indexer(jkey, right_ax, sort=sort)
join_index = left_ax.take(left_indexer)
else:
right_indexer = right_ax.get_indexer(jkey)

return join_index, left_indexer, right_indexer


_join_functions = {
'inner' : lib.inner_join,
'left' : lib.left_outer_join,
Expand Down
22 changes: 20 additions & 2 deletions pandas/tools/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,9 +438,15 @@ def test_merge_index_singlekey_right_vs_left(self):
index=['d', 'b', 'c', 'a'])

merged1 = merge(left, right, left_on='key',
right_index=True, how='left')
right_index=True, how='left', sort=False)
merged2 = merge(right, left, right_on='key',
left_index=True, how='right')
left_index=True, how='right', sort=False)
assert_frame_equal(merged1, merged2.ix[:, merged1.columns])

merged1 = merge(left, right, left_on='key',
right_index=True, how='left', sort=True)
merged2 = merge(right, left, right_on='key',
left_index=True, how='right', sort=True)
assert_frame_equal(merged1, merged2.ix[:, merged1.columns])

def test_merge_index_singlekey_inner(self):
Expand Down Expand Up @@ -505,6 +511,18 @@ def test_merge_nocopy(self):
merged['d'] = 'peekaboo'
self.assert_((right['d'] == 'peekaboo').all())

def test_join_sort(self):
    # Joining on a column with sort=True must order the rows by the join
    # key while each row keeps its original label from the calling frame.
    lookup = DataFrame({'value2' : ['a', 'b', 'c']},
                       index=['bar', 'baz', 'foo'])
    frame = DataFrame({'key' : ['foo', 'bar', 'baz', 'foo'],
                       'value' : [1, 2, 3, 4]})

    # Rows sorted by key: bar, baz, foo, foo -> original labels 1, 2, 0, 3
    expected = DataFrame({'key' : ['bar', 'baz', 'foo', 'foo'],
                          'value' : [2, 3, 1, 4],
                          'value2' : ['a', 'b', 'c', 'c']},
                         index=[1, 2, 0, 3])

    result = frame.join(lookup, on='key', sort=True)
    assert_frame_equal(result, expected)

class TestMergeMulti(unittest.TestCase):

Expand Down
7 changes: 7 additions & 0 deletions vb_suite/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
columns=['A', 'B', 'C', 'D'])
df_key2 = DataFrame(np.random.randn(len(level2), 4), index=level2,
columns=['A', 'B', 'C', 'D'])
df_shuf = df.reindex(df.index[shuf])
"""

#----------------------------------------------------------------------
Expand All @@ -54,6 +56,11 @@
Benchmark("df.join(df_key2, on='key2')", setup,
name='join_dataframe_index_single_key_bigger')

# Sorted variant of the single-key join benchmark, run on the shuffled frame.
# It carries its own name so it is not confused with the unsorted
# 'join_dataframe_index_single_key_bigger' benchmark.
join_dataframe_index_single_key_bigger_sort = \
    Benchmark("df_shuf.join(df_key2, on='key2', sort=True)", setup,
              name='join_dataframe_index_single_key_bigger_sort',
              start_date=datetime(2012, 2, 5))

join_dataframe_index_multi = \
Benchmark("df.join(df_multi, on=['key1', 'key2'])", setup,
name='join_dataframe_index_multi',
Expand Down

0 comments on commit 3cb301e

Please sign in to comment.