Skip to content

Commit

Permalink
ENH: first cuts on many-to-many joining, #249, #267
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Dec 29, 2011
1 parent c2ce803 commit 07f3914
Show file tree
Hide file tree
Showing 11 changed files with 936 additions and 565 deletions.
11 changes: 4 additions & 7 deletions TODO.rst
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
LongPanel removal
=================

- DONE level to flex methods
- DONE level to reindex
- ?? fast take for items

Join methods todo
-----------------
- Joint factorizer
- NA group handling

DONE
----
Expand Down
36 changes: 36 additions & 0 deletions bench/bench_merge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from pandas import *
import random

N = 10000
ngroups = 3

def get_test_data(ngroups=100, n=N):
    """Return an object-dtype array of length *n* filled with group labels
    ``0 .. ngroups - 1`` (each roughly ``n // ngroups`` times) in random order.

    Parameters
    ----------
    ngroups : int, default 100
        Number of distinct group labels.
    n : int, default N
        Total length of the returned array.
    """
    unique_groups = list(range(ngroups))
    # Floor division: np.tile needs an integer repetition count (the
    # original ``n / ngroups`` is a float under Python 3 true division).
    arr = np.asarray(np.tile(unique_groups, n // ngroups), dtype=object)

    # Pad with the first few labels when ngroups does not divide n evenly
    # (list + list also avoids the Python 3 list-plus-range TypeError).
    if len(arr) < n:
        arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
                         dtype=object)

    random.shuffle(arr)
    return arr

# aggregate multiple columns
# Left frame: ~N rows with two low-cardinality key columns (values drawn
# from range(ngroups)) plus two float data columns.
df = DataFrame({'key1' : get_test_data(ngroups=ngroups),
                'key2' : get_test_data(ngroups=ngroups),
                'data1' : np.random.randn(N),
                'data2' : np.random.randn(N)})

# Right frame: the (key1, key2) pairs (0,0), (1,1), (2,2) each appear
# twice, so joining the frames on both keys is many-to-many.
df2 = DataFrame({'key1' : [0, 1, 2, 0, 1, 2],
                 'key2' : [0, 1, 2, 0, 1, 2],
                 'value' : list('abcdef')})


import pandas.tools.merge as merge
# NOTE(review): `reload` is a Python 2 builtin — presumably used here to
# pick up edits to the merge module during interactive benchmarking.
reload(merge)

# Exercise the private key-factorization helper on a two-column key ...
left, right = merge._get_group_keys([df['key1'], df['key2']],
                                    [df2['key1'], df2['key2']])

# ... and on a single-column key; results are rebound, only the last kept.
left, right = merge._get_group_keys([df['key1']], [df2['key1']])

3 changes: 2 additions & 1 deletion pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1790,7 +1790,8 @@ def insert(self, loc, item):
new_levels.append(level)
new_labels.append(np.insert(labels, loc, lev_loc))

return MultiIndex(levels=new_levels, labels=new_labels, names=self.names)
return MultiIndex(levels=new_levels, labels=new_labels,
names=self.names)

def delete(self, loc):
"""
Expand Down
34 changes: 11 additions & 23 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1081,15 +1081,21 @@ def _union_items_slow(all_items):
return seen

def join_managers(left, right, axis=1, how='left', copy=True):
    """Join two BlockManagers along ``axis`` and return the merged result.

    Parameters
    ----------
    left, right : BlockManager
    axis : int, default 1
        Axis whose indexes are joined; must be > 0 (checked downstream).
    how : {'left', 'right', 'outer', 'inner'}, default 'left'
        Join method, forwarded to ``Index.join``.
    copy : bool, default True
        Whether the result may avoid sharing data with the inputs.
    """
    # Compute the joined axis and the per-side take-indexers up front, then
    # hand them to the orchestration object.  (A stale assignment calling
    # the old _JoinOperation(left, right, axis=..., how=...) signature was
    # removed — it would raise TypeError under the new constructor.)
    join_index, left_indexer, right_indexer = \
        left.axes[axis].join(right.axes[axis], how=how, return_indexers=True)
    op = _JoinOperation(left, right, join_index, left_indexer,
                        right_indexer, axis=axis)
    return op.get_result(copy=copy)

class _JoinOperation(object):
"""
Object responsible for orchestrating efficient join operation between two
BlockManager data structures
"""
def __init__(self, left, right, axis=1, how='left'):
def __init__(self, left, right, join_index, left_indexer, right_indexer,
axis=1):
assert(axis > 0)

if not left.is_consolidated():
left = left.consolidate()
if not right.is_consolidated():
Expand All @@ -1098,14 +1104,10 @@ def __init__(self, left, right, axis=1, how='left'):
self.left = left
self.right = right
self.axis = axis
self.how = how

laxis = left.axes[axis]
raxis = right.axes[axis]

(self.join_index,
self.lindexer,
self.rindexer) = laxis.join(raxis, how=how, return_indexers=True)
self.join_index = join_index
self.lindexer = left_indexer
self.rindexer = right_indexer

# do NOT sort
self.result_items = left.items.append(right.items)
Expand Down Expand Up @@ -1284,17 +1286,3 @@ def _upcast_blocks(blocks):

# use any ref_items
return _consolidate(new_blocks, newb.ref_items)

def _make_block_indexers(blocks, indexer, block_ids, block_locs, block_dtypes,
ref_items):
counts = defaultdict(int)
for dtype_name in block_dtypes.take(indexer):
counts[dtype_name] += 1

findexer = np.empty(counts['float64'], dtype='i4')
bindexer = np.empty(counts['bool'], dtype='i4')
oindexer = np.empty(counts['object'], dtype='i4')
iindexer = np.empty(counts['int64'], dtype='i4')

for idx in indexer:
pass
Loading

0 comments on commit 07f3914

Please sign in to comment.